Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions src/iceberg/manifest_reader_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,8 @@ Result<std::vector<ManifestFile>> ParseManifestList(ArrowSchema* schema,
if (!field.has_value()) {
return InvalidSchema("Field index {} is not found in schema", idx);
}
auto field_name = field.value().get().name();
bool required = !field.value().get().optional();
auto field_name = field.value()->get().name();
bool required = !field.value()->get().optional();
auto view_of_column = array_view.children[idx];
switch (idx) {
case 0:
Expand Down Expand Up @@ -340,8 +340,8 @@ Status ParseDataFile(const std::shared_ptr<StructType>& data_file_schema,
data_file_schema->fields().size(), view_of_column->n_children);
}
for (int64_t col_idx = 0; col_idx < view_of_column->n_children; ++col_idx) {
auto field_name = data_file_schema->GetFieldByIndex(col_idx).value().get().name();
auto required = !data_file_schema->GetFieldByIndex(col_idx).value().get().optional();
auto field_name = data_file_schema->GetFieldByIndex(col_idx).value()->get().name();
auto required = !data_file_schema->GetFieldByIndex(col_idx).value()->get().optional();
auto view_of_file_field = view_of_column->children[col_idx];
auto manifest_entry_count = view_of_file_field->length;

Expand Down Expand Up @@ -487,8 +487,8 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(ArrowSchema* schema,
if (!field.has_value()) {
return InvalidManifest("Field not found in schema: {}", idx);
}
auto field_name = field.value().get().name();
bool required = !field.value().get().optional();
auto field_name = field.value()->get().name();
bool required = !field.value()->get().optional();
auto view_of_column = array_view.children[idx];

switch (idx) {
Expand All @@ -510,7 +510,7 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(ArrowSchema* schema,
break;
case 4: {
auto data_file_schema =
dynamic_pointer_cast<StructType>(field.value().get().type());
dynamic_pointer_cast<StructType>(field.value()->get().type());
ICEBERG_RETURN_UNEXPECTED(
ParseDataFile(data_file_schema, view_of_column, manifest_entries));
break;
Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/table_scan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ Result<std::unique_ptr<TableScan>> TableScanBuilder::Build() {
return InvalidArgument("Column {} not found in schema '{}'", column_name,
*schema_id);
}
projected_fields.emplace_back(field_opt.value().get());
projected_fields.emplace_back(field_opt.value()->get());
}
context_.projected_schema =
std::make_shared<Schema>(std::move(projected_fields), schema->schema_id());
Expand Down
137 changes: 98 additions & 39 deletions src/iceberg/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,18 @@

#include "iceberg/exception.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "iceberg/util/macros.h"
#include "iceberg/util/string_util.h"

namespace iceberg {

StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {
size_t index = 0;
for (const auto& field : fields_) {
auto [it, inserted] = field_id_to_index_.try_emplace(field.field_id(), index);
if (!inserted) {
throw IcebergError(
std::format("StructType: duplicate field ID {} (field indices {} and {})",
field.field_id(), it->second, index));
}

++index;
}
Result<std::optional<NestedType::SchemaFieldConstRef>> NestedType::GetFieldByName(
std::string_view name) const {
return GetFieldByName(name, /*case_sensitive=*/true);
}

StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {}

TypeId StructType::type_id() const { return kTypeId; }

std::string StructType::ToString() const {
Expand All @@ -53,27 +48,34 @@ std::string StructType::ToString() const {
return repr;
}
std::span<const SchemaField> StructType::fields() const { return fields_; }
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldById(
Result<std::optional<NestedType::SchemaFieldConstRef>> StructType::GetFieldById(
int32_t field_id) const {
auto it = field_id_to_index_.find(field_id);
if (it == field_id_to_index_.end()) return std::nullopt;
return fields_[it->second];
ICEBERG_RETURN_UNEXPECTED(InitFieldById());
auto it = field_by_id_.find(field_id);
if (it == field_by_id_.end()) return std::nullopt;
return it->second;
}
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByIndex(
Result<std::optional<NestedType::SchemaFieldConstRef>> StructType::GetFieldByIndex(
int32_t index) const {
if (index < 0 || index >= static_cast<int32_t>(fields_.size())) {
return std::nullopt;
if (index < 0 || static_cast<size_t>(index) >= fields_.size()) {
return InvalidArgument("Invalid index {} to get field from struct", index);
}
return fields_[index];
}
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
std::string_view name) const {
// N.B. duplicate names are not permitted (looking at the Java
// implementation) so there is nothing in particular we need to do here
for (const auto& field : fields_) {
if (field.name() == name) {
return field;
Result<std::optional<NestedType::SchemaFieldConstRef>> StructType::GetFieldByName(
std::string_view name, bool case_sensitive) const {
if (case_sensitive) {
ICEBERG_RETURN_UNEXPECTED(InitFieldByName());
auto it = field_by_name_.find(name);
if (it != field_by_name_.end()) {
return it->second;
}
return std::nullopt;
}
ICEBERG_RETURN_UNEXPECTED(InitFieldByLowerCaseName());
auto it = field_by_lowercase_name_.find(StringUtils::ToLower(name));
if (it != field_by_lowercase_name_.end()) {
return it->second;
}
return std::nullopt;
}
Expand All @@ -84,6 +86,48 @@ bool StructType::Equals(const Type& other) const {
const auto& struct_ = static_cast<const StructType&>(other);
return fields_ == struct_.fields_;
}
/// \brief Lazily build the field-id -> field lookup table.
///
/// Does nothing if the table has already been built. Otherwise every field in
/// `fields_` is indexed by its field id.
///
/// The table is assembled in a local map and only published into
/// `field_by_id_` once all fields were inserted without conflict. A failed
/// build therefore leaves the cache empty, so a later call repeats the scan
/// and reports the same error again instead of treating a half-filled cache
/// as a successful initialization.
///
/// \return An empty Status on success, or an InvalidSchema error when two
/// fields share the same field id.
Status StructType::InitFieldById() const {
  if (!field_by_id_.empty()) {
    return {};
  }
  decltype(field_by_id_) index;
  index.reserve(fields_.size());
  for (const auto& field : fields_) {
    const auto [pos, inserted] = index.try_emplace(field.field_id(), field);
    if (!inserted) {
      return InvalidSchema("Duplicate field id found: {} (prev name: {}, curr name: {})",
                           field.field_id(), pos->second.get().name(), field.name());
    }
  }
  // Commit only after the whole struct has been validated.
  field_by_id_.swap(index);
  return {};
}
/// \brief Lazily build the case-sensitive field-name -> field lookup table.
///
/// Does nothing if the table has already been built. Otherwise every field in
/// `fields_` is indexed by the value returned from `SchemaField::name()`.
///
/// The table is assembled in a local map and only published into
/// `field_by_name_` once all fields were inserted without conflict. A failed
/// build therefore leaves the cache empty, so a later call repeats the scan
/// and reports the same error again instead of treating a half-filled cache
/// as a successful initialization.
///
/// \return An empty Status on success, or an InvalidSchema error when two
/// fields share the same name.
Status StructType::InitFieldByName() const {
  if (!field_by_name_.empty()) {
    return {};
  }
  decltype(field_by_name_) index;
  index.reserve(fields_.size());
  for (const auto& field : fields_) {
    const auto [pos, inserted] = index.try_emplace(field.name(), field);
    if (!inserted) {
      return InvalidSchema("Duplicate field name found: {} (prev id: {}, curr id: {})",
                           pos->first, pos->second.get().field_id(), field.field_id());
    }
  }
  // Commit only after the whole struct has been validated.
  field_by_name_.swap(index);
  return {};
}
/// \brief Lazily build the lower-cased field-name -> field lookup table used
/// for case-insensitive lookups.
///
/// Does nothing if the table has already been built. Otherwise every field in
/// `fields_` is indexed by `StringUtils::ToLower(field.name())`.
///
/// The table is assembled in a local map and only published into
/// `field_by_lowercase_name_` once all fields were inserted without conflict.
/// A failed build therefore leaves the cache empty, so a later call repeats
/// the scan and reports the same error again instead of treating a
/// half-filled cache as a successful initialization.
///
/// \return An empty Status on success, or an InvalidSchema error when two
/// fields have names that are equal after lower-casing.
Status StructType::InitFieldByLowerCaseName() const {
  if (!field_by_lowercase_name_.empty()) {
    return {};
  }
  decltype(field_by_lowercase_name_) index;
  index.reserve(fields_.size());
  for (const auto& field : fields_) {
    const auto [pos, inserted] =
        index.try_emplace(StringUtils::ToLower(field.name()), field);
    if (!inserted) {
      return InvalidSchema(
          "Duplicate lowercase field name found: {} (prev id: {}, curr id: {})",
          pos->first, pos->second.get().field_id(), field.field_id());
    }
  }
  // Commit only after the whole struct has been validated.
  field_by_lowercase_name_.swap(index);
  return {};
}

ListType::ListType(SchemaField element) : element_(std::move(element)) {
if (element_.name() != kElementName) {
Expand All @@ -105,23 +149,29 @@ std::string ListType::ToString() const {
return repr;
}
std::span<const SchemaField> ListType::fields() const { return {&element_, 1}; }
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldById(
Result<std::optional<NestedType::SchemaFieldConstRef>> ListType::GetFieldById(
int32_t field_id) const {
if (field_id == element_.field_id()) {
return std::cref(element_);
}
return std::nullopt;
}
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByIndex(
Result<std::optional<NestedType::SchemaFieldConstRef>> ListType::GetFieldByIndex(
int index) const {
if (index == 0) {
return std::cref(element_);
}
return std::nullopt;
return InvalidArgument("Invalid index {} to get field from list", index);
}
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
std::string_view name) const {
if (name == element_.name()) {
Result<std::optional<NestedType::SchemaFieldConstRef>> ListType::GetFieldByName(
std::string_view name, bool case_sensitive) const {
if (case_sensitive) {
if (name == kElementName) {
return std::cref(element_);
}
return std::nullopt;
}
if (StringUtils::ToLower(name) == kElementName) {
return std::cref(element_);
}
return std::nullopt;
Expand Down Expand Up @@ -159,7 +209,7 @@ std::string MapType::ToString() const {
return repr;
}
std::span<const SchemaField> MapType::fields() const { return fields_; }
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldById(
Result<std::optional<NestedType::SchemaFieldConstRef>> MapType::GetFieldById(
int32_t field_id) const {
if (field_id == key().field_id()) {
return key();
Expand All @@ -168,20 +218,29 @@ std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldById(
}
return std::nullopt;
}
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByIndex(
Result<std::optional<NestedType::SchemaFieldConstRef>> MapType::GetFieldByIndex(
int32_t index) const {
if (index == 0) {
return key();
} else if (index == 1) {
return value();
}
return std::nullopt;
return InvalidArgument("Invalid index {} to get field from map", index);
}
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
std::string_view name) const {
if (name == kKeyName) {
Result<std::optional<NestedType::SchemaFieldConstRef>> MapType::GetFieldByName(
std::string_view name, bool case_sensitive) const {
if (case_sensitive) {
if (name == kKeyName) {
return key();
} else if (name == kValueName) {
return value();
}
return std::nullopt;
}
const auto lower_case_name = StringUtils::ToLower(name);
if (lower_case_name == kKeyName) {
return key();
} else if (name == kValueName) {
} else if (lower_case_name == kValueName) {
return value();
}
return std::nullopt;
Expand Down
57 changes: 36 additions & 21 deletions src/iceberg/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/schema_field.h"
#include "iceberg/util/formattable.h"

Expand Down Expand Up @@ -75,23 +76,27 @@ class ICEBERG_EXPORT NestedType : public Type {

/// \brief Get a view of the child fields.
[[nodiscard]] virtual std::span<const SchemaField> fields() const = 0;
using SchemaFieldConstRef = std::reference_wrapper<const SchemaField>;
/// \brief Get a field by field ID.
///
/// \note This is O(1) complexity.
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
GetFieldById(int32_t field_id) const = 0;
[[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldById(
int32_t field_id) const = 0;
/// \brief Get a field by index.
///
/// \note This is O(1) complexity.
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
GetFieldByIndex(int32_t index) const = 0;
/// \brief Get a field by name (case-sensitive). Behavior is undefined if
[[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
int32_t index) const = 0;
/// \brief Get a field by name. Returns an error if
/// the field name is not unique; prefer GetFieldById or GetFieldByIndex
/// when possible.
///
/// \note This is currently O(n) complexity.
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
GetFieldByName(std::string_view name) const = 0;
/// \note This is O(1) complexity.
[[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
std::string_view name, bool case_sensitive) const = 0;
/// \brief Get a field by name (case-sensitive).
[[nodiscard]] Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
std::string_view name) const;
};

/// \defgroup type-nested Nested Types
Expand All @@ -109,18 +114,26 @@ class ICEBERG_EXPORT StructType : public NestedType {
std::string ToString() const override;

std::span<const SchemaField> fields() const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
Result<std::optional<SchemaFieldConstRef>> GetFieldById(
int32_t field_id) const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
int32_t index) const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
std::string_view name) const override;
Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
std::string_view name, bool case_sensitive) const override;
using NestedType::GetFieldByName;

protected:
bool Equals(const Type& other) const override;
// TODO(nullccxsy): Lazy initialization has concurrency issues, need to add proper
// synchronization mechanism
Status InitFieldById() const;
Status InitFieldByName() const;
Status InitFieldByLowerCaseName() const;

std::vector<SchemaField> fields_;
std::unordered_map<int32_t, size_t> field_id_to_index_;
mutable std::unordered_map<int32_t, SchemaFieldConstRef> field_by_id_;
mutable std::unordered_map<std::string_view, SchemaFieldConstRef> field_by_name_;
mutable std::unordered_map<std::string, SchemaFieldConstRef> field_by_lowercase_name_;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to wait for #180 and rebase on it to use StringHash here.

};

/// \brief A data type representing a list of values.
Expand All @@ -140,12 +153,13 @@ class ICEBERG_EXPORT ListType : public NestedType {
std::string ToString() const override;

std::span<const SchemaField> fields() const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
Result<std::optional<SchemaFieldConstRef>> GetFieldById(
int32_t field_id) const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
int32_t index) const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
std::string_view name) const override;
Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
std::string_view name, bool case_sensitive) const override;
using NestedType::GetFieldByName;

protected:
bool Equals(const Type& other) const override;
Expand All @@ -172,12 +186,13 @@ class ICEBERG_EXPORT MapType : public NestedType {
std::string ToString() const override;

std::span<const SchemaField> fields() const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
Result<std::optional<SchemaFieldConstRef>> GetFieldById(
int32_t field_id) const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
int32_t index) const override;
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
std::string_view name) const override;
Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
std::string_view name, bool case_sensitive) const override;
using NestedType::GetFieldByName;

protected:
bool Equals(const Type& other) const override;
Expand Down
11 changes: 4 additions & 7 deletions src/iceberg/util/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,10 @@

#pragma once

#define ICEBERG_RETURN_UNEXPECTED(result) \
do { \
auto&& result_name = (result); \
if (!result_name) [[unlikely]] { \
return std::unexpected<Error>(result_name.error()); \
} \
} while (false);
// Evaluates `result` once and, if the value tests false, returns
// std::unexpected<Error> carrying its error() from the enclosing function.
//
// NOTE(review): this expands to a bare `if` statement rather than a
// do { ... } while (false) wrapper. When used as the sole body of an unbraced
// `if` that is followed by `else`, the `else` would either fail to compile or
// attach to the macro's own `if`; put the invocation inside braces there.
#define ICEBERG_RETURN_UNEXPECTED(result) \
if (auto&& result_name = result; !result_name) [[unlikely]] { \
return std::unexpected<Error>(result_name.error()); \
}

#define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
auto&& result_name = (rexpr); \
Expand Down
Loading
Loading