Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions src/iceberg/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,57 @@

#include "iceberg/schema.h"

#include <algorithm>
#include <format>
#include <functional>

#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "iceberg/util/macros.h"
#include "iceberg/util/visit_type.h"

namespace iceberg {

class IdToFieldVisitor {
public:
explicit IdToFieldVisitor(
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
id_to_field);
Status Visit(const PrimitiveType& type);
Status Visit(const NestedType& type);

private:
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
};

class NameToIdVisitor {
public:
explicit NameToIdVisitor(
std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& name_to_id,
bool case_sensitive = true,
std::function<std::string(std::string_view)> quoting_func = {});
Status Visit(const ListType& type, const std::string& path,
const std::string& short_path);
Status Visit(const MapType& type, const std::string& path,
const std::string& short_path);
Status Visit(const StructType& type, const std::string& path,
const std::string& short_path);
Status Visit(const PrimitiveType& type, const std::string& path,
const std::string& short_path);
void Finish();

private:
std::string BuildPath(std::string_view prefix, std::string_view field_name,
bool case_sensitive);

private:
bool case_sensitive_;
std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& name_to_id_;
std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>
short_name_to_id_;
std::function<std::string(std::string_view)> quoting_func_;
};

/// \brief Construct a schema from its fields and an optional schema id.
///
/// `fields` is taken by value and moved into the StructType base; `schema_id` is
/// stored in `schema_id_`.
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
: StructType(std::move(fields)), schema_id_(schema_id) {}

Expand All @@ -44,4 +88,175 @@ bool Schema::Equals(const Schema& other) const {
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
}

/// \brief Look up a field by (possibly dotted) name.
///
/// The matching name index is built on first use; any error from building it is
/// returned to the caller. For a case-insensitive lookup the requested name is
/// lowercased with StringUtils::ToLower and searched in the lowercase index.
///
/// \param name Field name to search for.
/// \param case_sensitive Whether the match must respect letter case.
/// \return The field, std::nullopt when the name is unknown, or an error.
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
    std::string_view name, bool case_sensitive) const {
  if (!case_sensitive) {
    ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
    const auto entry = lowercase_name_to_id_.find(StringUtils::ToLower(name));
    if (entry != lowercase_name_to_id_.end()) {
      return FindFieldById(entry->second);
    }
    return std::nullopt;
  }

  ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap());
  const auto entry = name_to_id_.find(name);
  if (entry != name_to_id_.end()) {
    return FindFieldById(entry->second);
  }
  return std::nullopt;
}

/// \brief Lazily build the field-id -> field index.
///
/// A non-empty `id_to_field_` is treated as "already built". The index is assembled in
/// a local map and only moved into `id_to_field_` after the whole traversal succeeded.
/// Otherwise a traversal that fails part-way (e.g. on a duplicate field id) would leave
/// a partially filled cache behind, and the next call would take the early-return path
/// and report success on top of incomplete data.
///
/// \return Success, or the error reported by the traversal.
Status Schema::InitIdToFieldMap() const {
  if (!id_to_field_.empty()) {
    return {};
  }
  decltype(id_to_field_) staged;
  IdToFieldVisitor visitor(staged);
  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
  id_to_field_ = std::move(staged);
  return {};
}

/// \brief Lazily build the case-sensitive name -> field-id index.
///
/// A non-empty `name_to_id_` is treated as "already built". The index is assembled in a
/// local map and only moved into `name_to_id_` once the traversal succeeded and the
/// short names were merged. This keeps a failed traversal (e.g. a duplicate path) from
/// leaving a partially filled cache that later calls would accept as complete.
///
/// \return Success, or the error reported by the traversal.
Status Schema::InitNameToIdMap() const {
  if (!name_to_id_.empty()) {
    return {};
  }
  decltype(name_to_id_) staged;
  NameToIdVisitor visitor(staged, /*case_sensitive=*/true);
  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
  visitor.Finish();
  name_to_id_ = std::move(staged);
  return {};
}

/// \brief Lazily build the lowercased name -> field-id index used for
/// case-insensitive lookups.
///
/// A non-empty `lowercase_name_to_id_` is treated as "already built". The index is
/// assembled in a local map and only moved into `lowercase_name_to_id_` once the
/// traversal succeeded and the short names were merged. This keeps a failed traversal
/// (e.g. two paths that collide after lowercasing) from leaving a partially filled
/// cache that later calls would accept as complete.
///
/// \return Success, or the error reported by the traversal.
Status Schema::InitLowerCaseNameToIdMap() const {
  if (!lowercase_name_to_id_.empty()) {
    return {};
  }
  decltype(lowercase_name_to_id_) staged;
  NameToIdVisitor visitor(staged, /*case_sensitive=*/false);
  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
  visitor.Finish();
  lowercase_name_to_id_ = std::move(staged);
  return {};
}

/// \brief Look up a field by its field id.
///
/// Builds the id index on first use and forwards any error from doing so.
///
/// \param field_id Id of the field to find.
/// \return The field, std::nullopt when no field has that id, or an error.
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
    int32_t field_id) const {
  ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap());
  const auto entry = id_to_field_.find(field_id);
  if (entry != id_to_field_.end()) {
    return entry->second;
  }
  return std::nullopt;
}

/// \brief Bind the visitor to the map it will fill.
///
/// Only a reference to `id_to_field` is stored, so the map must stay alive for as long
/// as the visitor is used.
IdToFieldVisitor::IdToFieldVisitor(
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
: id_to_field_(id_to_field) {}

/// \brief Primitive types carry no child fields, so there is nothing to index.
Status IdToFieldVisitor::Visit([[maybe_unused]] const PrimitiveType& type) {
  return {};
}

/// \brief Index the direct children of a nested type and recurse into each of them.
///
/// Each child is inserted under its field id before its own type is visited. The first
/// id that is already present aborts the traversal with an InvalidSchema error.
Status IdToFieldVisitor::Visit(const NestedType& type) {
  const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
  for (const auto& child : nested.fields()) {
    const bool inserted =
        id_to_field_.try_emplace(child.field_id(), std::cref(child)).second;
    if (!inserted) {
      return InvalidSchema("Duplicate field id found: {}", child.field_id());
    }
    ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*child.type(), this));
  }
  return {};
}

/// \brief Bind the visitor to its destination map and lookup options.
///
/// Only a reference to `name_to_id` is stored. `quoting_func` may be empty, in which
/// case field names are used unchanged when paths are built.
///
/// The initializers are listed in member declaration order (case_sensitive_,
/// name_to_id_, quoting_func_). Members are always initialized in declaration order
/// regardless of how the list is written, so matching it avoids -Wreorder diagnostics.
NameToIdVisitor::NameToIdVisitor(
    std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& name_to_id,
    bool case_sensitive, std::function<std::string(std::string_view)> quoting_func)
    : case_sensitive_(case_sensitive),
      name_to_id_(name_to_id),
      quoting_func_(std::move(quoting_func)) {}

/// \brief Index the element field of a list and descend into the element type.
///
/// The canonical path always gains the element field's name. The short path gains it
/// only when the element is not a struct; for struct elements the incoming short path
/// is reused, so nested fields get short names without the element segment.
///
/// \return An InvalidSchema error if the canonical path is already registered,
/// otherwise the result of visiting the element type.
Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
                              const std::string& short_path) {
  const auto& element = type.fields()[0];
  const std::string element_path = BuildPath(path, element.name(), case_sensitive_);
  const bool element_is_struct = element.type()->type_id() == TypeId::kStruct;
  const std::string element_short_path =
      element_is_struct ? short_path
                        : BuildPath(short_path, element.name(), case_sensitive_);

  auto [existing, inserted] = name_to_id_.try_emplace(element_path, element.field_id());
  if (!inserted) {
    return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                         existing->first, existing->second, element.field_id());
  }
  // Short names never override each other: the first registration wins.
  short_name_to_id_.try_emplace(element_short_path, element.field_id());
  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*element.type(), this, element_path, element_short_path));
  return {};
}

/// \brief Index every field of a map and descend into each field's type.
///
/// The canonical path always gains the field's name. The short path gains it too,
/// except for a field named MapType::kValueName whose type is a struct: that one reuses
/// the incoming short path, so the struct's fields get short names without the value
/// segment.
///
/// \return An InvalidSchema error if a canonical path is already registered,
/// otherwise success once all fields have been visited.
Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& entry_field : type.fields()) {
    const std::string entry_path = BuildPath(path, entry_field.name(), case_sensitive_);
    const bool is_struct_value = entry_field.name() == MapType::kValueName &&
                                 entry_field.type()->type_id() == TypeId::kStruct;
    const std::string entry_short_path =
        is_struct_value ? short_path
                        : BuildPath(short_path, entry_field.name(), case_sensitive_);

    auto [existing, inserted] =
        name_to_id_.try_emplace(entry_path, entry_field.field_id());
    if (!inserted) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           existing->first, existing->second, entry_field.field_id());
    }
    // Short names never override each other: the first registration wins.
    short_name_to_id_.try_emplace(entry_short_path, entry_field.field_id());
    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*entry_field.type(), this, entry_path, entry_short_path));
  }
  return {};
}

/// \brief Index every field of a struct and descend into each field's type.
///
/// Both the canonical path and the short path are extended with the field's name.
///
/// \return An InvalidSchema error if a canonical path is already registered,
/// otherwise success once all fields have been visited.
Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& member : type.fields()) {
    const std::string member_path = BuildPath(path, member.name(), case_sensitive_);
    const std::string member_short_path =
        BuildPath(short_path, member.name(), case_sensitive_);

    auto [existing, inserted] = name_to_id_.try_emplace(member_path, member.field_id());
    if (!inserted) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           existing->first, existing->second, member.field_id());
    }
    // Short names never override each other: the first registration wins.
    short_name_to_id_.try_emplace(member_short_path, member.field_id());
    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*member.type(), this, member_path, member_short_path));
  }
  return {};
}

/// \brief Primitive types have no child fields, so no names are added for them here.
Status NameToIdVisitor::Visit([[maybe_unused]] const PrimitiveType& type,
                              [[maybe_unused]] const std::string& path,
                              [[maybe_unused]] const std::string& short_path) {
  return {};
}

/// \brief Append one field name to a dotted path.
///
/// The name is first passed through `quoting_func_` when one was supplied, then
/// lowercased when `case_sensitive` is false. `prefix` itself is used unchanged.
///
/// \param prefix Path built so far; may be empty.
/// \param field_name Name of the field to append.
/// \param case_sensitive When false, the appended segment is lowercased.
/// \return The processed name alone if `prefix` is empty, else "<prefix>.<name>".
std::string NameToIdVisitor::BuildPath(std::string_view prefix,
                                       std::string_view field_name,
                                       bool case_sensitive) {
  std::string segment =
      quoting_func_ ? quoting_func_(field_name) : std::string(field_name);
  if (!case_sensitive) {
    segment = StringUtils::ToLower(segment);
  }
  if (prefix.empty()) {
    return segment;
  }
  return std::string(prefix) + "." + segment;
}

/// \brief Merge the collected short names into the destination map.
///
/// try_emplace leaves existing keys untouched, so a short name is only added when no
/// entry with the same key is already present.
void NameToIdVisitor::Finish() {
  for (const auto& [short_name, field_id] : short_name_to_id_) {
    name_to_id_.try_emplace(short_name, field_id);
  }
}

} // namespace iceberg
35 changes: 35 additions & 0 deletions src/iceberg/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/schema_field.h"
#include "iceberg/type.h"
#include "iceberg/util/string_util.h"

namespace iceberg {

Expand All @@ -54,13 +56,46 @@ class ICEBERG_EXPORT Schema : public StructType {

[[nodiscard]] std::string ToString() const override;

/// \brief Find the SchemaField by field name.
///
/// Short names for maps and lists are included for any name that does not conflict with
/// a canonical name. For example, a list 'l' of structs with field 'x' will produce
/// short name 'l.x' in addition to canonical name 'l.element.x'. Similarly, a map 'm'
/// whose value is a struct with field 'x' will produce short name 'm.x' in addition to
/// canonical name 'm.value.x'.
///
/// FIXME: Currently only handles ASCII lowercase conversion; extend to support
/// non-ASCII characters (e.g., using std::towlower or ICU).
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(std::string_view name, bool case_sensitive = true) const;

/// \brief Find the SchemaField by field id.
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(int32_t field_id) const;

/// \brief Equality comparison; forwards to Schema::Equals.
friend bool operator==(const Schema& lhs, const Schema& rhs) {
  return lhs.Equals(rhs);
}

private:
// The three maps below are lookup caches. They are declared `mutable` so that the
// const Init*Map() helpers can fill them on first use; an empty map is what those
// helpers treat as "not built yet".
/// Mapping from field id to field.
mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
id_to_field_;
/// Mapping from field name to field id.
mutable std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>
name_to_id_;
/// Mapping from lowercased field name to field id
mutable std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>
lowercase_name_to_id_;

private:
/// \brief Compare two schemas for equality.
[[nodiscard]] bool Equals(const Schema& other) const;

const std::optional<int32_t> schema_id_;

// TODO(nullccxsy): Address potential concurrency issues in lazy initialization (e.g.,
// use std::call_once)
/// \brief Fill `id_to_field_` on first use; returns immediately if it is non-empty.
Status InitIdToFieldMap() const;
/// \brief Fill `name_to_id_` (case-sensitive names) on first use; returns immediately
/// if it is non-empty.
Status InitNameToIdMap() const;
/// \brief Fill `lowercase_name_to_id_` (lowercased names) on first use; returns
/// immediately if it is non-empty.
Status InitLowerCaseNameToIdMap() const;
};

} // namespace iceberg
11 changes: 7 additions & 4 deletions src/iceberg/util/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@

#pragma once

// Early-return helper: when `result` evaluates to false, returns
// std::unexpected<Error>(result.error()) from the enclosing function.
// Note: the `result` argument is expanded twice (once in the condition and once
// for `.error()`), it is not parenthesized, and the expansion is a bare `if`
// statement rather than a single wrapped statement.
#define ICEBERG_RETURN_UNEXPECTED(result) \
if (!result) [[unlikely]] { \
return std::unexpected<Error>(result.error()); \
}
// Early-return helper: evaluates `result` exactly once and, when it converts to
// false, returns std::unexpected<Error> carrying its error from the enclosing
// function.
//
// The body is wrapped in do { ... } while (false) WITHOUT a trailing semicolon, so
// that `ICEBERG_RETURN_UNEXPECTED(expr);` expands to exactly one statement. With a
// semicolon baked into the macro, the caller's own `;` becomes a second, empty
// statement and `if (c) ICEBERG_RETURN_UNEXPECTED(expr); else ...` fails to compile.
// Call sites must therefore terminate the macro invocation with `;`.
#define ICEBERG_RETURN_UNEXPECTED(result)                         \
  do {                                                            \
    auto&& iceberg_temp_result = (result);                        \
    if (!iceberg_temp_result) [[unlikely]] {                      \
      return std::unexpected<Error>(iceberg_temp_result.error()); \
    }                                                             \
  } while (false)

#define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
auto&& result_name = (rexpr); \
Expand Down
13 changes: 13 additions & 0 deletions src/iceberg/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,17 @@ class ICEBERG_EXPORT StringUtils {
}
};

/// \brief Hasher for string-keyed unordered containers with heterogeneous lookup.
///
/// Declaring `is_transparent` lets std::unordered_map (together with a transparent
/// key-equality such as std::equal_to<>) accept std::string_view, std::string or
/// `const char*` lookup keys directly, without building a temporary std::string.
/// All overloads hash through std::hash<std::string_view>.
struct ICEBERG_EXPORT string_hash {
  using is_transparent = void;
  using hash_type = std::hash<std::string_view>;

  std::size_t operator()(const std::string& str) const {
    return (*this)(std::string_view{str});
  }
  std::size_t operator()(const char* str) const {
    return (*this)(std::string_view{str});
  }
  std::size_t operator()(std::string_view str) const {
    const hash_type hasher{};
    return hasher(str);
  }
};

} // namespace iceberg
Loading
Loading