From 3846d968dc0b3f3f79c21d3b06db56cf8b8cb267 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 15 Jan 2025 20:16:39 -0500 Subject: [PATCH 01/13] Add headers for type/field/schema --- docs/Doxyfile | 4 +- src/iceberg/CMakeLists.txt | 2 +- src/iceberg/schema.cc | 47 ++++ src/iceberg/schema.h | 64 ++++++ src/iceberg/schema_field.cc | 64 ++++++ src/iceberg/schema_field.h | 87 ++++++++ src/iceberg/type.cc | 297 ++++++++++++++++++++++++ src/iceberg/type.h | 397 +++++++++++++++++++++++++++++++++ src/iceberg/type_fwd.h | 63 ++++++ src/iceberg/util/formattable.h | 44 ++++ src/iceberg/util/formatter.h | 42 ++++ 11 files changed, 1108 insertions(+), 3 deletions(-) create mode 100644 src/iceberg/schema.cc create mode 100644 src/iceberg/schema.h create mode 100644 src/iceberg/schema_field.cc create mode 100644 src/iceberg/schema_field.h create mode 100644 src/iceberg/type.cc create mode 100644 src/iceberg/type.h create mode 100644 src/iceberg/type_fwd.h create mode 100644 src/iceberg/util/formattable.h create mode 100644 src/iceberg/util/formatter.h diff --git a/docs/Doxyfile b/docs/Doxyfile index d576080fe..75d69394e 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -195,7 +195,7 @@ INLINE_INHERITED_MEMB = NO # shortest path that makes the file name unique will be used # The default value is: YES. -FULL_PATH_NAMES = NO +FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand @@ -207,7 +207,7 @@ FULL_PATH_NAMES = NO # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. 
-STRIP_FROM_PATH = +STRIP_FROM_PATH = ../src # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 4bebe4e4e..91706dc87 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -set(ICEBERG_SOURCES demo_table.cc) +set(ICEBERG_SOURCES demo_table.cc schema.cc schema_field.cc type.cc) add_iceberg_lib(iceberg SOURCES diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc new file mode 100644 index 000000000..12874d68d --- /dev/null +++ b/src/iceberg/schema.cc @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/schema.h" + +#include + +#include "iceberg/type.h" +#include "iceberg/util/formatter.h" + +namespace iceberg { + +Schema::Schema(int32_t schema_id, std::vector fields) + : StructType(std::move(fields)), schema_id_(schema_id) {} + +int32_t Schema::schema_id() const { return schema_id_; } + +std::string Schema::ToString() const { + std::string repr = "schema<"; + for (const auto& field : fields_) { + std::format_to(std::back_inserter(repr), " {}\n", field); + } + repr += ">"; + return repr; +} + +bool Schema::Equals(const Schema& other) const { + return schema_id_ == other.schema_id_ && fields_ == other.fields_; +} + +} // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h new file mode 100644 index 000000000..43ec9e16a --- /dev/null +++ b/src/iceberg/schema.h @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/schema.h +/// Schemas for Iceberg tables. This header contains the definition of Schema +/// and any utility functions. See iceberg/type.h and iceberg/schema_field.h as well. 
+ +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/schema_field.h" +#include "iceberg/type.h" + +namespace iceberg { + +/// \brief A schema for a Table. +/// +/// A schema is a list of typed columns, along with a unique integer ID. A +/// Table may have different schemas over its lifetime due to schema +/// evolution. +class ICEBERG_EXPORT Schema : public StructType { + public: + Schema(int32_t schema_id, std::vector fields); + + /// \brief Get the schema ID. + /// + /// Schemas are identified by a unique ID for the purposes of schema + /// evolution. + [[nodiscard]] int32_t schema_id() const; + + [[nodiscard]] std::string ToString() const override; + + friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } + + friend bool operator!=(const Schema& lhs, const Schema& rhs) { return !(lhs == rhs); } + + private: + /// \brief Compare two schemas for equality. + [[nodiscard]] bool Equals(const Schema& other) const; + + const int32_t schema_id_; +}; + +} // namespace iceberg diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc new file mode 100644 index 000000000..4de00b87a --- /dev/null +++ b/src/iceberg/schema_field.cc @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/schema_field.h" + +#include + +#include "iceberg/type.h" +#include "iceberg/util/formatter.h" + +namespace iceberg { + +SchemaField::SchemaField(int32_t field_id, std::string name, std::shared_ptr type, + bool optional) + : field_id_(field_id), + name_(std::move(name)), + type_(std::move(type)), + optional_(optional) {} + +SchemaField SchemaField::MakeOptional(int32_t field_id, std::string name, + std::shared_ptr type) { + return SchemaField(field_id, std::move(name), std::move(type), true); +} + +SchemaField SchemaField::MakeRequired(int32_t field_id, std::string name, + std::shared_ptr type) { + return SchemaField(field_id, std::move(name), std::move(type), false); +} + +int32_t SchemaField::field_id() const { return field_id_; } + +std::string_view SchemaField::name() const { return name_; } + +const std::shared_ptr& SchemaField::type() const { return type_; } + +bool SchemaField::optional() const { return optional_; } + +std::string SchemaField::ToString() const { + return std::format("{} ({}): {}{}", name_, field_id_, *type_, + optional_ ? "" : " (required)"); +} + +bool SchemaField::Equals(const SchemaField& other) const { + return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ && + optional_ == other.optional_; +} + +} // namespace iceberg diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h new file mode 100644 index 000000000..e37c2d2d8 --- /dev/null +++ b/src/iceberg/schema_field.h @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/schema_field.h +/// A (schema) field is a name and a type and is part of a schema or nested +/// type (e.g. a struct). + +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/type_fwd.h" +#include "iceberg/util/formattable.h" + +namespace iceberg { + +/// \brief A type combined with a name. +class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { + public: + /// \brief Construct a field. + /// \param[in] field_id The field ID. + /// \param[in] name The field name. + /// \param[in] type The field type. + /// \param[in] optional Whether values of this field are required or nullable. + SchemaField(int32_t field_id, std::string name, std::shared_ptr type, + bool optional); + + /// \brief Construct an optional (nullable) field. + static SchemaField MakeOptional(int32_t field_id, std::string name, + std::shared_ptr type); + /// \brief Construct a required (non-null) field. + static SchemaField MakeRequired(int32_t field_id, std::string name, + std::shared_ptr type); + + /// \brief Get the field ID. + [[nodiscard]] int32_t field_id() const; + + /// \brief Get the field name. + [[nodiscard]] std::string_view name() const; + + /// \brief Get the field type. + [[nodiscard]] const std::shared_ptr& type() const; + + /// \brief Get whether the field is optional. 
+ [[nodiscard]] bool optional() const; + + [[nodiscard]] std::string ToString() const; + + friend bool operator==(const SchemaField& lhs, const SchemaField& rhs) { + return lhs.Equals(rhs); + } + + friend bool operator!=(const SchemaField& lhs, const SchemaField& rhs) { + return !(lhs == rhs); + } + + private: + /// \brief Compare two fields for equality. + [[nodiscard]] bool Equals(const SchemaField& other) const; + + int32_t field_id_; + std::string name_; + std::shared_ptr type_; + bool optional_; +}; + +} // namespace iceberg diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc new file mode 100644 index 000000000..1132fce67 --- /dev/null +++ b/src/iceberg/type.cc @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/type.h" + +#include +#include +#include + +#include "iceberg/util/formatter.h" + +namespace iceberg { + +TypeId BooleanType::type_id() const { return TypeId::kBoolean; } +std::string BooleanType::ToString() const { return "boolean"; } +bool BooleanType::Equals(const Type& other) const { + return other.type_id() == TypeId::kBoolean; +} + +TypeId Int32Type::type_id() const { return TypeId::kInt32; } +std::string Int32Type::ToString() const { return "int32"; } +bool Int32Type::Equals(const Type& other) const { + return other.type_id() == TypeId::kInt32; +} + +TypeId Int64Type::type_id() const { return TypeId::kInt64; } +std::string Int64Type::ToString() const { return "int64"; } +bool Int64Type::Equals(const Type& other) const { + return other.type_id() == TypeId::kInt64; +} + +TypeId Float32Type::type_id() const { return TypeId::kFloat32; } +std::string Float32Type::ToString() const { return "float32"; } +bool Float32Type::Equals(const Type& other) const { + return other.type_id() == TypeId::kFloat32; +} + +TypeId Float64Type::type_id() const { return TypeId::kFloat64; } +std::string Float64Type::ToString() const { return "float64"; } +bool Float64Type::Equals(const Type& other) const { + return other.type_id() == TypeId::kFloat64; +} + +DecimalType::DecimalType(int32_t precision, int32_t scale) + : precision_(precision), scale_(scale) { + if (precision < 0 || precision > kMaxPrecision) { + throw std::runtime_error( + std::format("DecimalType: precision must be in [0, 38], was {}", precision)); + } +} + +int32_t DecimalType::precision() const { return precision_; } +int32_t DecimalType::scale() const { return scale_; } +TypeId DecimalType::type_id() const { return TypeId::kDecimal; } +std::string DecimalType::ToString() const { + return std::format("decimal({}, {})", precision_, scale_); +} +bool DecimalType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kDecimal) { + return false; + } + const auto& decimal = 
static_cast(other); + return precision_ == decimal.precision_ && scale_ == decimal.scale_; +} + +TypeId TimeType::type_id() const { return TypeId::kTime; } +std::string TimeType::ToString() const { return "time"; } +bool TimeType::Equals(const Type& other) const { + return other.type_id() == TypeId::kTime; +} + +TypeId DateType::type_id() const { return TypeId::kDate; } +std::string DateType::ToString() const { return "date"; } +bool DateType::Equals(const Type& other) const { + return other.type_id() == TypeId::kDate; +} + +TypeId TimestampType::type_id() const { return TypeId::kTimestamp; } +std::string TimestampType::ToString() const { return "timestamp"; } +bool TimestampType::Equals(const Type& other) const { + return other.type_id() == TypeId::kTimestamp; +} + +TypeId TimestampTzType::type_id() const { return TypeId::kTimestampTz; } +std::string TimestampTzType::ToString() const { return "timestamptz"; } +bool TimestampTzType::Equals(const Type& other) const { + return other.type_id() == TypeId::kTimestampTz; +} + +TypeId BinaryType::type_id() const { return TypeId::kBinary; } +std::string BinaryType::ToString() const { return "binary"; } +bool BinaryType::Equals(const Type& other) const { + return other.type_id() == TypeId::kBinary; +} + +TypeId StringType::type_id() const { return TypeId::kString; } +std::string StringType::ToString() const { return "string"; } +bool StringType::Equals(const Type& other) const { + return other.type_id() == TypeId::kString; +} + +FixedType::FixedType(int32_t length) : length_(length) { + if (length < 0) { + throw std::runtime_error( + std::format("FixedType: length must be >= 0, was {}", length)); + } +} + +int32_t FixedType::length() const { return length_; } +TypeId FixedType::type_id() const { return TypeId::kFixed; } +std::string FixedType::ToString() const { return std::format("fixed({})", length_); } +bool FixedType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kFixed) { + return false; + } + 
const auto& fixed = static_cast(other); + return length_ == fixed.length_; +} + +TypeId UuidType::type_id() const { return TypeId::kUuid; } +std::string UuidType::ToString() const { return "uuid"; } +bool UuidType::Equals(const Type& other) const { + return other.type_id() == TypeId::kUuid; +} + +ListType::ListType(SchemaField element) : element_(std::move(element)) { + if (element_.name() != kElementName) { + throw std::runtime_error( + std::format("ListType: child field name should be '{}', was '{}'", kElementName, + element_.name())); + } +} + +ListType::ListType(int32_t field_id, std::shared_ptr type, bool optional) + : element_(field_id, std::string(kElementName), std::move(type), optional) {} + +TypeId ListType::type_id() const { return TypeId::kList; } +std::string ListType::ToString() const { return std::format("list<{}>", element_); } +std::span ListType::fields() const { return {&element_, 1}; } +std::optional> ListType::GetFieldById( + int32_t field_id) const { + if (field_id == element_.field_id()) { + return std::cref(element_); + } + return std::nullopt; +} +std::optional> ListType::GetFieldByIndex( + int index) const { + if (index == 0) { + return std::cref(element_); + } + return std::nullopt; +} +std::optional> ListType::GetFieldByName( + std::string_view name) const { + if (name == element_.name()) { + return std::cref(element_); + } + return std::nullopt; +} +bool ListType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kList) { + return false; + } + const auto& list = static_cast(other); + return element_ == list.element_; +} + +MapType::MapType(SchemaField key, SchemaField value) + : fields_{std::move(key), std::move(value)} { + if (this->key().name() != kKeyName) { + throw std::runtime_error( + std::format("MapType: key field name should be '{}', was '{}'", kKeyName, + this->key().name())); + } + if (this->value().name() != kValueName) { + throw std::runtime_error( + std::format("MapType: value field name should be '{}', 
was '{}'", kValueName, + this->value().name())); + } +} + +const SchemaField& MapType::key() const { return fields_[0]; } +const SchemaField& MapType::value() const { return fields_[1]; } +TypeId MapType::type_id() const { return TypeId::kMap; } +std::string MapType::ToString() const { + return std::format("map<{}: {}>", key(), value()); +} +std::span MapType::fields() const { return fields_; } +std::optional> MapType::GetFieldById( + int32_t field_id) const { + if (field_id == key().field_id()) { + return key(); + } else if (field_id == value().field_id()) { + return value(); + } + return std::nullopt; +} +std::optional> MapType::GetFieldByIndex( + int index) const { + if (index == 0) { + return key(); + } else if (index == 1) { + return value(); + } + return std::nullopt; +} +std::optional> MapType::GetFieldByName( + std::string_view name) const { + if (name == kKeyName) { + return key(); + } else if (name == kValueName) { + return value(); + } + return std::nullopt; +} +bool MapType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kMap) { + return false; + } + const auto& map = static_cast(other); + return fields_ == map.fields_; +} + +StructType::StructType(std::vector fields) : fields_(std::move(fields)) { + size_t index = 0; + for (const auto& field : fields_) { + auto [it, inserted] = field_id_to_index_.try_emplace(field.field_id(), index); + if (!inserted) { + throw std::runtime_error( + std::format("StructType: duplicate field ID {} (field indices {} and {})", + field.field_id(), it->second, index)); + } + + index++; + } +} + +TypeId StructType::type_id() const { return TypeId::kStruct; } +std::string StructType::ToString() const { + std::string repr = "struct<\n"; + for (const auto& field : fields_) { + std::format_to(std::back_inserter(repr), " {}\n", field); + } + repr += ">"; + return repr; +} +std::span StructType::fields() const { return fields_; } +std::optional> StructType::GetFieldById( + int32_t field_id) const { + auto it = 
field_id_to_index_.find(field_id); + if (it == field_id_to_index_.end()) return std::nullopt; + return fields_[it->second]; +} +std::optional> StructType::GetFieldByIndex( + int index) const { + if (index < 0 || index >= static_cast(fields_.size())) { + return std::nullopt; + } + return fields_[index]; +} +std::optional> StructType::GetFieldByName( + std::string_view name) const { + // TODO: what is the right behavior if there are duplicate names? (Are + // duplicate names permitted?) + for (const auto& field : fields_) { + if (field.name() == name) { + return field; + } + } + return std::nullopt; +} +bool StructType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kStruct) { + return false; + } + const auto& struct_ = static_cast(other); + return fields_ == struct_.fields_; +} + +} // namespace iceberg diff --git a/src/iceberg/type.h b/src/iceberg/type.h new file mode 100644 index 000000000..3910b2314 --- /dev/null +++ b/src/iceberg/type.h @@ -0,0 +1,397 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/type.h +/// Data types for Iceberg. This header defines the data types, but see +/// iceberg/type_fwd.h for the enum defining the list of types. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/schema_field.h" +#include "iceberg/util/formattable.h" + +namespace iceberg { + +/// \brief Interface for a data type for a field. +class ICEBERG_EXPORT Type : public iceberg::util::Formattable { + public: + virtual ~Type() = default; + + /// \brief Get the type ID. + [[nodiscard]] virtual TypeId type_id() const = 0; + + /// \brief Is this a primitive type (may not have child fields)? + [[nodiscard]] virtual bool is_primitive() const = 0; + + /// \brief Is this a nested type (may have child fields)? + [[nodiscard]] virtual bool is_nested() const = 0; + + /// \brief Compare two types for equality. + friend bool operator==(const Type& lhs, const Type& rhs) { return lhs.Equals(rhs); } + + /// \brief Compare two types for inequality. + friend bool operator!=(const Type& lhs, const Type& rhs) { return !(lhs == rhs); } + + protected: + /// \brief Compare two types for equality. + [[nodiscard]] virtual bool Equals(const Type& other) const = 0; +}; + +/// \brief A data type that may not have child fields. +class ICEBERG_EXPORT PrimitiveType : public Type { + public: + bool is_primitive() const override { return true; } + bool is_nested() const override { return false; } +}; + +/// \brief A data type that may have child fields. +class ICEBERG_EXPORT NestedType : public Type { + public: + bool is_primitive() const override { return false; } + bool is_nested() const override { return true; } + + /// \brief Get a view of the child fields. + [[nodiscard]] virtual std::span fields() const = 0; + /// \brief Get a field by field ID. + [[nodiscard]] virtual std::optional> + GetFieldById(int32_t field_id) const = 0; + /// \brief Get a field by index. + [[nodiscard]] virtual std::optional> + GetFieldByIndex(int i) const = 0; + /// \brief Get a field by name. 
+ [[nodiscard]] virtual std::optional> + GetFieldByName(std::string_view name) const = 0; +}; + +/// \defgroup type-primitive Primitive Types +/// Primitive types do not have nested fields. +/// @{ + +/// \brief A data type representing a boolean. +class ICEBERG_EXPORT BooleanType : public PrimitiveType { + public: + BooleanType() = default; + ~BooleanType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a 32-bit signed integer. +class ICEBERG_EXPORT Int32Type : public PrimitiveType { + public: + Int32Type() = default; + ~Int32Type() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a 64-bit signed integer. +class ICEBERG_EXPORT Int64Type : public PrimitiveType { + public: + Int64Type() = default; + ~Int64Type() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a 32-bit (single precision) float. +class ICEBERG_EXPORT Float32Type : public PrimitiveType { + public: + Float32Type() = default; + ~Float32Type() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a 64-bit (double precision) float. +class ICEBERG_EXPORT Float64Type : public PrimitiveType { + public: + Float64Type() = default; + ~Float64Type() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a fixed-precision decimal. 
+class ICEBERG_EXPORT DecimalType : public PrimitiveType { + public: + constexpr static const int32_t kMaxPrecision = 38; + + /// \brief Construct a decimal type with the given precision and scale. + DecimalType(int32_t precision, int32_t scale); + ~DecimalType() = default; + + /// \brief Get the precision (the number of decimal digits). + [[nodiscard]] int32_t precision() const; + /// \brief Get the scale (essentially, the number of decimal digits after + /// the decimal point; precisely, the value is scaled by $$10^{-s}$$.). + [[nodiscard]] int32_t scale() const; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; + + private: + int32_t precision_; + int32_t scale_; +}; + +/// \brief A data type representing a calendar date without reference to a +/// timezone or time. +class ICEBERG_EXPORT DateType : public PrimitiveType { + public: + DateType() = default; + ~DateType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a wall clock time in microseconds without +/// reference to a timezone or date. +class ICEBERG_EXPORT TimeType : public PrimitiveType { + public: + TimeType() = default; + ~TimeType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a timestamp in microseconds without +/// reference to a timezone. +class ICEBERG_EXPORT TimestampType : public PrimitiveType { + public: + TimestampType() = default; + ~TimestampType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a timestamp in microseconds in a +/// particular timezone. 
+class ICEBERG_EXPORT TimestampTzType : public PrimitiveType { + public: + TimestampTzType() = default; + ~TimestampTzType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a bytestring. +class ICEBERG_EXPORT BinaryType : public PrimitiveType { + public: + BinaryType() = default; + ~BinaryType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a string. +class ICEBERG_EXPORT StringType : public PrimitiveType { + public: + StringType() = default; + ~StringType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// \brief A data type representing a fixed-length bytestring. +class ICEBERG_EXPORT FixedType : public PrimitiveType { + public: + /// \brief Construct a fixed type with the given length. + explicit FixedType(int32_t length); + ~FixedType() = default; + + /// \brief The length (the number of bytes to store). + [[nodiscard]] int32_t length() const; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; + + private: + int32_t length_; +}; + +/// \brief A data type representing a UUID. While defined as a distinct type, +/// it is effectively a fixed(16). +class ICEBERG_EXPORT UuidType : public PrimitiveType { + public: + UuidType() = default; + ~UuidType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// @} + +/// \defgroup type-nested Nested Types +/// Nested types have nested fields. +/// @{ + +/// \brief A data type representing a list of values. 
+class ICEBERG_EXPORT ListType : public NestedType { + public: + constexpr static const std::string_view kElementName = "element"; + + /// \brief Construct a list of the given element. The name of the child + /// field should be "element". + explicit ListType(SchemaField element); + /// \brief Construct a list of the given element type. + ListType(int32_t field_id, std::shared_ptr type, bool optional); + ~ListType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + std::span fields() const override; + std::optional> GetFieldById( + int32_t field_id) const override; + std::optional> GetFieldByIndex( + int i) const override; + std::optional> GetFieldByName( + std::string_view name) const override; + + protected: + bool Equals(const Type& other) const override; + + SchemaField element_; +}; + +/// \brief A data type representing a dictionary of values. +class ICEBERG_EXPORT MapType : public NestedType { + public: + constexpr static const std::string_view kKeyName = "key"; + constexpr static const std::string_view kValueName = "value"; + + /// \brief Construct a map of the given key/value fields. The field names + /// should be "key" and "value", respectively. + explicit MapType(SchemaField key, SchemaField value); + ~MapType() = default; + + const SchemaField& key() const; + const SchemaField& value() const; + + TypeId type_id() const override; + std::string ToString() const override; + + std::span fields() const override; + std::optional> GetFieldById( + int32_t field_id) const override; + std::optional> GetFieldByIndex( + int i) const override; + std::optional> GetFieldByName( + std::string_view name) const override; + + protected: + bool Equals(const Type& other) const override; + + std::array fields_; +}; + +/// \brief A data type representing a struct with nested fields. 
+class ICEBERG_EXPORT StructType : public NestedType { + public: + explicit StructType(std::vector fields); + ~StructType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + std::span fields() const override; + std::optional> GetFieldById( + int32_t field_id) const override; + std::optional> GetFieldByIndex( + int i) const override; + std::optional> GetFieldByName( + std::string_view name) const override; + + protected: + bool Equals(const Type& other) const override; + + std::vector fields_; + std::unordered_map field_id_to_index_; +}; + +/// @} + +// TODO: need to specialize std::format (ideally via a trait?) + +} // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h new file mode 100644 index 000000000..b05bb7d3f --- /dev/null +++ b/src/iceberg/type_fwd.h @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/type_fwd.h +/// Forward declarations and enum definitions. When writing your own headers, +/// you can include this instead of the "full" headers to help reduce compile +/// times. + +namespace iceberg { + +/// \brief A data type. 
+/// +/// This is not a complete data type by itself because some types are nested +/// and/or parameterized. +/// +/// Iceberg V3 types are not currently supported. +enum class TypeId { + kBoolean, + kInt32, + kInt64, + kFloat32, + kFloat64, + kDecimal, + kDate, + kTime, + kTimestamp, + kTimestampTz, + kBinary, + kString, + kFixed, + kUuid, + kStruct, + kList, + kMap, +}; + +class BooleanType; +class SchemaField; +class NestedType; +class PrimitiveType; +class Schema; +class StructType; +class Type; + +} // namespace iceberg diff --git a/src/iceberg/util/formattable.h b/src/iceberg/util/formattable.h new file mode 100644 index 000000000..422c5a921 --- /dev/null +++ b/src/iceberg/util/formattable.h @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/util/formattable.h +/// Interface for objects that can be formatted via std::format. The actual +/// std::formatter specialization is in iceberg/util/formatter.h to avoid +/// bringing in unnecessarily. + +#include + +#include "iceberg/iceberg_export.h" + +namespace iceberg::util { + +/// \brief Interface for objects that can be formatted via std::format. 
+/// +/// You must include iceberg/util/formatter.h when calling std::format. +class ICEBERG_EXPORT Formattable { + public: + virtual ~Formattable() = default; + + /// \brief Get a user-readable string representation. + virtual std::string ToString() const = 0; +}; + +} // namespace iceberg::util diff --git a/src/iceberg/util/formatter.h b/src/iceberg/util/formatter.h new file mode 100644 index 000000000..42817fdc8 --- /dev/null +++ b/src/iceberg/util/formatter.h @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/util/formatter.h +/// A specialization of std::formatter for Formattable objects. This header +/// is separate from iceberg/util/formattable.h so that the latter (which is +/// meant to be included widely) does not leak unnecessarily into +/// other headers. + +#include +#include +#include +#include + +#include "iceberg/util/formattable.h" + +/// \brief Make all classes deriving from iceberg::util::Formattable +/// formattable with std::format. 
+template Derived> +struct std::formatter : std::formatter { + auto format(const iceberg::util::Formattable& obj, std::format_context& ctx) const { + return std::formatter::format(obj.ToString(), ctx); + } +}; From ce37966edd232e6b26c4f4d11e2ec337031e0dad Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 22 Jan 2025 15:18:00 +0900 Subject: [PATCH 02/13] fix clang --- src/iceberg/type.cc | 17 +++++++++++++++-- src/iceberg/util/formatter.h | 5 +++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 1132fce67..1b664b291 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -151,7 +151,14 @@ ListType::ListType(int32_t field_id, std::shared_ptr type, bool optional) : element_(field_id, std::string(kElementName), std::move(type), optional) {} TypeId ListType::type_id() const { return TypeId::kList; } -std::string ListType::ToString() const { return std::format("list<{}>", element_); } +std::string ListType::ToString() const { + // XXX: work around Clang/libc++: "<{}>" in a format string appears to get + // parsed as {<>} or something; split up the format string to avoid that + std::string repr = "list<"; + std::format_to(std::back_inserter(repr), "{}", element_); + repr += ">"; + return repr; +} std::span ListType::fields() const { return {&element_, 1}; } std::optional> ListType::GetFieldById( int32_t field_id) const { @@ -200,7 +207,13 @@ const SchemaField& MapType::key() const { return fields_[0]; } const SchemaField& MapType::value() const { return fields_[1]; } TypeId MapType::type_id() const { return TypeId::kMap; } std::string MapType::ToString() const { - return std::format("map<{}: {}>", key(), value()); + // XXX: work around Clang/libc++: "<{}>" in a format string appears to get + // parsed as {<>} or something; split up the format string to avoid that + std::string repr = "map<"; + + std::format_to(std::back_inserter(repr), "{}: {}", key(), value()); + repr += "}"; + return repr; } std::span 
MapType::fields() const { return fields_; } std::optional> MapType::GetFieldById( diff --git a/src/iceberg/util/formatter.h b/src/iceberg/util/formatter.h index 42817fdc8..ed1c5cbaa 100644 --- a/src/iceberg/util/formatter.h +++ b/src/iceberg/util/formatter.h @@ -33,10 +33,11 @@ #include "iceberg/util/formattable.h" /// \brief Make all classes deriving from iceberg::util::Formattable -/// formattable with std::format. +/// formattable with std::format. template Derived> struct std::formatter : std::formatter { - auto format(const iceberg::util::Formattable& obj, std::format_context& ctx) const { + template + auto format(const iceberg::util::Formattable& obj, FormatContext& ctx) const { return std::formatter::format(obj.ToString(), ctx); } }; From dd166d358d756df8d63a2a7dd6b8a2a0bb916600 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 22 Jan 2025 06:25:54 -0500 Subject: [PATCH 03/13] updates --- src/iceberg/type.cc | 8 ++++++-- src/iceberg/type.h | 32 ++++++++++++++++++++++++-------- src/iceberg/type_fwd.h | 5 +++++ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 1b664b291..d44d554a0 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -91,12 +91,16 @@ bool DateType::Equals(const Type& other) const { return other.type_id() == TypeId::kDate; } +bool TimestampType::is_zoned() const { return false; } +TimeUnit TimestampType::time_unit() const { return TimeUnit::kMicrosecond; } TypeId TimestampType::type_id() const { return TypeId::kTimestamp; } std::string TimestampType::ToString() const { return "timestamp"; } bool TimestampType::Equals(const Type& other) const { return other.type_id() == TypeId::kTimestamp; } +bool TimestampTzType::is_zoned() const { return true; } +TimeUnit TimestampTzType::time_unit() const { return TimeUnit::kMicrosecond; } TypeId TimestampTzType::type_id() const { return TypeId::kTimestampTz; } std::string TimestampTzType::ToString() const { return "timestamptz"; } bool 
TimestampTzType::Equals(const Type& other) const { @@ -226,7 +230,7 @@ std::optional> MapType::GetFieldById( return std::nullopt; } std::optional> MapType::GetFieldByIndex( - int index) const { + int32_t index) const { if (index == 0) { return key(); } else if (index == 0) { @@ -282,7 +286,7 @@ std::optional> StructType::GetFieldByI return fields_[it->second]; } std::optional> StructType::GetFieldByIndex( - int index) const { + int32_t index) const { if (index < 0 || index >= static_cast(fields_.size())) { return std::nullopt; } diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 3910b2314..aba37f72b 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -83,7 +83,7 @@ class ICEBERG_EXPORT NestedType : public Type { GetFieldById(int32_t field_id) const = 0; /// \brief Get a field by index. [[nodiscard]] virtual std::optional> - GetFieldByIndex(int i) const = 0; + GetFieldByIndex(int32_t index) const = 0; /// \brief Get a field by name. [[nodiscard]] virtual std::optional> GetFieldByName(std::string_view name) const = 0; @@ -212,13 +212,26 @@ class ICEBERG_EXPORT TimeType : public PrimitiveType { bool Equals(const Type& other) const override; }; +/// \brief A base class for any timestamp time (irrespective of unit or +/// timezone). +class ICEBERG_EXPORT TimestampBase : public PrimitiveType { + public: + /// \brief Is this type zoned or naive? + [[nodiscard]] virtual bool is_zoned() const = 0; + /// \brief The time resolution. + [[nodiscard]] virtual TimeUnit time_unit() const = 0; +}; + /// \brief A data type representing a timestamp in microseconds without /// reference to a timezone. 
-class ICEBERG_EXPORT TimestampType : public PrimitiveType { +class ICEBERG_EXPORT TimestampType : public TimestampBase { public: TimestampType() = default; ~TimestampType() = default; + bool is_zoned() const override; + TimeUnit time_unit() const override; + TypeId type_id() const override; std::string ToString() const override; @@ -226,13 +239,16 @@ class ICEBERG_EXPORT TimestampType : public PrimitiveType { bool Equals(const Type& other) const override; }; -/// \brief A data type representing a timestamp in microseconds in a -/// particular timezone. -class ICEBERG_EXPORT TimestampTzType : public PrimitiveType { +/// \brief A data type representing a timestamp as microseconds since the +/// epoch in UTC. +class ICEBERG_EXPORT TimestampTzType : public TimestampBase { public: TimestampTzType() = default; ~TimestampTzType() = default; + bool is_zoned() const override; + TimeUnit time_unit() const override; + TypeId type_id() const override; std::string ToString() const override; @@ -325,7 +341,7 @@ class ICEBERG_EXPORT ListType : public NestedType { std::optional> GetFieldById( int32_t field_id) const override; std::optional> GetFieldByIndex( - int i) const override; + int32_t index) const override; std::optional> GetFieldByName( std::string_view name) const override; @@ -356,7 +372,7 @@ class ICEBERG_EXPORT MapType : public NestedType { std::optional> GetFieldById( int32_t field_id) const override; std::optional> GetFieldByIndex( - int i) const override; + int32_t index) const override; std::optional> GetFieldByName( std::string_view name) const override; @@ -379,7 +395,7 @@ class ICEBERG_EXPORT StructType : public NestedType { std::optional> GetFieldById( int32_t field_id) const override; std::optional> GetFieldByIndex( - int i) const override; + int32_t index) const override; std::optional> GetFieldByName( std::string_view name) const override; diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index b05bb7d3f..c70a2a7d8 100644 --- 
a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -52,6 +52,11 @@ enum class TypeId { kMap, }; +/// \brief The time unit. In Iceberg V3 nanoseconds are also supported. +enum class TimeUnit { + kMicrosecond, +}; + class BooleanType; class SchemaField; class NestedType; From f51e8d36ee3b24cbfcb44c42c120b99f40b46fb3 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 22 Jan 2025 19:39:52 -0500 Subject: [PATCH 04/13] add some unit tests --- src/iceberg/type.cc | 2 +- src/iceberg/type.h | 2 - src/iceberg/type_fwd.h | 19 ++- test/core/CMakeLists.txt | 2 +- test/core/type_test.cc | 256 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 276 insertions(+), 5 deletions(-) create mode 100644 test/core/type_test.cc diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index d44d554a0..dc3235442 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -216,7 +216,7 @@ std::string MapType::ToString() const { std::string repr = "map<"; std::format_to(std::back_inserter(repr), "{}: {}", key(), value()); - repr += "}"; + repr += ">"; return repr; } std::span MapType::fields() const { return fields_; } diff --git a/src/iceberg/type.h b/src/iceberg/type.h index aba37f72b..50f3d18a9 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -408,6 +408,4 @@ class ICEBERG_EXPORT StructType : public NestedType { /// @} -// TODO: need to specialize std::format (ideally via a trait?) 
- } // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index c70a2a7d8..5726c8f1e 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -57,12 +57,29 @@ enum class TimeUnit { kMicrosecond, }; +class BinaryType; class BooleanType; -class SchemaField; +class DateType; +class DecimalType; +class FixedType; +class Float32Type; +class Float64Type; +class Int32Type; +class Int64Type; +class ListType; +class MapType; class NestedType; class PrimitiveType; class Schema; +class SchemaField; +class StringType; +class StructType; class StructType; +class TimeType; +class TimestampBase; +class TimestampType; +class TimestampTzType; class Type; +class UuidType; } // namespace iceberg diff --git a/test/core/CMakeLists.txt b/test/core/CMakeLists.txt index 551201779..787f5cbd1 100644 --- a/test/core/CMakeLists.txt +++ b/test/core/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. add_executable(core_unittest) -target_sources(core_unittest PRIVATE core_unittest.cc) +target_sources(core_unittest PRIVATE core_unittest.cc type_test.cc) target_link_libraries(core_unittest PRIVATE iceberg_static GTest::gtest_main) target_include_directories(core_unittest PRIVATE "${ICEBERG_INCLUDES}") add_test(NAME core_unittest COMMAND core_unittest) diff --git a/test/core/type_test.cc b/test/core/type_test.cc new file mode 100644 index 000000000..5bb2dfb07 --- /dev/null +++ b/test/core/type_test.cc @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/type.h" + +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "iceberg/util/formatter.h" + +struct TypeTestCase { + /// Test case name, must be safe for Googletest (alphanumeric + underscore) + std::string name; + std::shared_ptr type; + iceberg::TypeId type_id; + bool primitive; + std::string repr; +}; + +std::string TypeTestCaseToString(const ::testing::TestParamInfo& info) { + return info.param.name; +} + +class TypeTest : public ::testing::TestWithParam {}; + +TEST_P(TypeTest, TypeId) { + const auto& test_case = GetParam(); + ASSERT_EQ(test_case.type_id, test_case.type->type_id()); +} + +TEST_P(TypeTest, IsPrimitive) { + const auto& test_case = GetParam(); + if (test_case.primitive) { + ASSERT_TRUE(test_case.type->is_primitive()); + ASSERT_FALSE(test_case.type->is_nested()); + + const auto* primitive = + dynamic_cast(test_case.type.get()); + ASSERT_NE(nullptr, primitive); + } +} + +TEST_P(TypeTest, IsNested) { + const auto& test_case = GetParam(); + if (!test_case.primitive) { + ASSERT_FALSE(test_case.type->is_primitive()); + ASSERT_TRUE(test_case.type->is_nested()); + + const auto* nested = dynamic_cast(test_case.type.get()); + ASSERT_NE(nullptr, nested); + } +} + +TEST_P(TypeTest, ReflexiveEquality) { + const auto& test_case = GetParam(); + ASSERT_EQ(*test_case.type, *test_case.type); +} + +TEST_P(TypeTest, ToString) { + const auto& test_case = GetParam(); + ASSERT_EQ(test_case.repr, test_case.type->ToString()); +} + +TEST_P(TypeTest, StdFormat) { + const auto& test_case = GetParam(); + 
ASSERT_EQ(test_case.repr, std::format("{}", *test_case.type)); +} + +const static TypeTestCase kPrimitiveTypes[] = { + { + .name = "boolean", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kBoolean, + .primitive = true, + .repr = "boolean", + }, + { + .name = "int32", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kInt32, + .primitive = true, + .repr = "int32", + }, + { + .name = "int64", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kInt64, + .primitive = true, + .repr = "int64", + }, + { + .name = "float32", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kFloat32, + .primitive = true, + .repr = "float32", + }, + { + .name = "float64", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kFloat64, + .primitive = true, + .repr = "float64", + }, + { + .name = "decimal9_2", + .type = std::make_shared(9, 2), + .type_id = iceberg::TypeId::kDecimal, + .primitive = true, + .repr = "decimal(9, 2)", + }, + { + .name = "decimal38_10", + .type = std::make_shared(38, 10), + .type_id = iceberg::TypeId::kDecimal, + .primitive = true, + .repr = "decimal(38, 10)", + }, + { + .name = "date", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kDate, + .primitive = true, + .repr = "date", + }, + { + .name = "time", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kTime, + .primitive = true, + .repr = "time", + }, + { + .name = "timestamp", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kTimestamp, + .primitive = true, + .repr = "timestamp", + }, + { + .name = "timestamptz", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kTimestampTz, + .primitive = true, + .repr = "timestamptz", + }, + { + .name = "binary", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kBinary, + .primitive = true, + .repr = "binary", + }, + { + .name = "string", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kString, + .primitive = true, + .repr = "string", + }, + { + .name = 
"fixed10", + .type = std::make_shared(10), + .type_id = iceberg::TypeId::kFixed, + .primitive = true, + .repr = "fixed(10)", + }, + { + .name = "fixed255", + .type = std::make_shared(255), + .type_id = iceberg::TypeId::kFixed, + .primitive = true, + .repr = "fixed(255)", + }, + { + .name = "uuid", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kUuid, + .primitive = true, + .repr = "uuid", + }, +}; + +const static TypeTestCase kNestedTypes[] = { + { + .name = "list_int", + .type = std::make_shared( + 1, std::make_shared(), true), + .type_id = iceberg::TypeId::kList, + .primitive = false, + .repr = "list", + }, + { + .name = "list_list_int", + .type = std::make_shared( + 1, + std::make_shared(2, std::make_shared(), + true), + false), + .type_id = iceberg::TypeId::kList, + .primitive = false, + .repr = "list (required)>", + }, + { + .name = "map_int_string", + .type = std::make_shared( + iceberg::SchemaField::MakeRequired(1, "key", + std::make_shared()), + iceberg::SchemaField::MakeRequired(2, "value", + std::make_shared())), + .type_id = iceberg::TypeId::kMap, + .primitive = false, + .repr = "map", + }, + { + .name = "struct", + .type = std::make_shared(std::vector{ + iceberg::SchemaField::MakeRequired(1, "foo", + std::make_shared()), + iceberg::SchemaField::MakeOptional(2, "bar", + std::make_shared()), + }), + .type_id = iceberg::TypeId::kStruct, + .primitive = false, + .repr = R"(struct< + foo (1): int64 (required) + bar (2): string +>)", + }, +}; + +INSTANTIATE_TEST_SUITE_P(Primitive, TypeTest, ::testing::ValuesIn(kPrimitiveTypes), + TypeTestCaseToString); + +INSTANTIATE_TEST_SUITE_P(Nested, TypeTest, ::testing::ValuesIn(kNestedTypes), + TypeTestCaseToString); From 91cd927fdf52f64e2bc59cab8c81a0508fc62157 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 22 Jan 2025 21:13:08 -0500 Subject: [PATCH 05/13] add more unit tests --- test/core/type_test.cc | 93 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff 
--git a/test/core/type_test.cc b/test/core/type_test.cc index 5bb2dfb07..d05b76e9f 100644 --- a/test/core/type_test.cc +++ b/test/core/type_test.cc @@ -21,11 +21,12 @@ #include #include +#include #include +#include #include -#include "gtest/gtest.h" #include "iceberg/util/formatter.h" struct TypeTestCase { @@ -254,3 +255,93 @@ INSTANTIATE_TEST_SUITE_P(Primitive, TypeTest, ::testing::ValuesIn(kPrimitiveType INSTANTIATE_TEST_SUITE_P(Nested, TypeTest, ::testing::ValuesIn(kNestedTypes), TypeTestCaseToString); + +TEST(TypeTest, Equality) { + std::vector> alltypes; + for (const auto& test_case : kPrimitiveTypes) { + alltypes.push_back(test_case.type); + } + for (const auto& test_case : kNestedTypes) { + alltypes.push_back(test_case.type); + } + + for (size_t i = 0; i < alltypes.size(); i++) { + for (size_t j = 0; j < alltypes.size(); j++) { + SCOPED_TRACE(std::format("{} == {}", *alltypes[i], *alltypes[j])); + + if (i == j) { + ASSERT_EQ(*alltypes[i], *alltypes[j]); + } else { + ASSERT_NE(*alltypes[i], *alltypes[j]); + } + } + } +} + +TEST(TypeTest, Decimal) { + { + iceberg::DecimalType decimal(38, 2); + ASSERT_EQ(38, decimal.precision()); + ASSERT_EQ(2, decimal.scale()); + } + { + iceberg::DecimalType decimal(10, -10); + ASSERT_EQ(10, decimal.precision()); + ASSERT_EQ(-10, decimal.scale()); + } + ASSERT_THAT([]() { iceberg::DecimalType decimal(-1, 10); }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("precision must be in [0, 38], was -1"))); + + ASSERT_THAT([]() { iceberg::DecimalType decimal(39, 10); }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("precision must be in [0, 38], was 39"))); +} + +TEST(TypeTest, Fixed) { + { + iceberg::FixedType fixed(0); + ASSERT_EQ(0, fixed.length()); + } + { + iceberg::FixedType fixed(1); + ASSERT_EQ(1, fixed.length()); + } + { + iceberg::FixedType fixed(127); + ASSERT_EQ(127, fixed.length()); + } + ASSERT_THAT([]() { iceberg::FixedType decimal(-1); }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("length must be >= 
0, was -1"))); +} + +TEST(TypeTest, List) { + { + iceberg::SchemaField field(5, "element", std::make_shared(), + true); + iceberg::ListType list(field); + std::span fields = list.fields(); + ASSERT_EQ(1, fields.size()); + ASSERT_EQ(field, fields[0]); + ASSERT_THAT(list.GetFieldById(5), ::testing::Optional(field)); + ASSERT_THAT(list.GetFieldByIndex(0), ::testing::Optional(field)); + ASSERT_THAT(list.GetFieldByName("element"), ::testing::Optional(field)); + + ASSERT_EQ(std::nullopt, list.GetFieldById(0)); + ASSERT_EQ(std::nullopt, list.GetFieldByIndex(1)); + ASSERT_EQ(std::nullopt, list.GetFieldByIndex(-1)); + ASSERT_EQ(std::nullopt, list.GetFieldByName("foo")); + } + ASSERT_THAT( + []() { + iceberg::ListType list(iceberg::SchemaField( + 1, "wrongname", std::make_shared(), true)); + }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("child field name should be 'element', was 'wrongname'"))); +} + +TEST(TypeTest, Map) {} + +TEST(TypeTest, Struct) {} From 85f1ade4c3142ab5bca4486bfdb9bc2f5f423327 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 22 Jan 2025 21:23:51 -0500 Subject: [PATCH 06/13] add more unit tests --- src/iceberg/type.cc | 2 +- test/core/type_test.cc | 75 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 3 deletions(-) diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index dc3235442..2cdeaf63f 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -233,7 +233,7 @@ std::optional> MapType::GetFieldByInde int32_t index) const { if (index == 0) { return key(); - } else if (index == 0) { + } else if (index == 1) { return value(); } return std::nullopt; diff --git a/test/core/type_test.cc b/test/core/type_test.cc index d05b76e9f..7d7112021 100644 --- a/test/core/type_test.cc +++ b/test/core/type_test.cc @@ -342,6 +342,77 @@ TEST(TypeTest, List) { ::testing::HasSubstr("child field name should be 'element', was 'wrongname'"))); } -TEST(TypeTest, Map) {} +TEST(TypeTest, Map) { + { + iceberg::SchemaField key(5, "key", 
std::make_shared(), true); + iceberg::SchemaField value(7, "value", std::make_shared(), true); + iceberg::MapType map(key, value); + std::span fields = map.fields(); + ASSERT_EQ(2, fields.size()); + ASSERT_EQ(key, fields[0]); + ASSERT_EQ(value, fields[1]); + ASSERT_THAT(map.GetFieldById(5), ::testing::Optional(key)); + ASSERT_THAT(map.GetFieldById(7), ::testing::Optional(value)); + ASSERT_THAT(map.GetFieldByIndex(0), ::testing::Optional(key)); + ASSERT_THAT(map.GetFieldByIndex(1), ::testing::Optional(value)); + ASSERT_THAT(map.GetFieldByName("key"), ::testing::Optional(key)); + ASSERT_THAT(map.GetFieldByName("value"), ::testing::Optional(value)); + + ASSERT_EQ(std::nullopt, map.GetFieldById(0)); + ASSERT_EQ(std::nullopt, map.GetFieldByIndex(2)); + ASSERT_EQ(std::nullopt, map.GetFieldByIndex(-1)); + ASSERT_EQ(std::nullopt, map.GetFieldByName("element")); + } + ASSERT_THAT( + []() { + iceberg::SchemaField key(5, "notkey", std::make_shared(), + true); + iceberg::SchemaField value(7, "value", std::make_shared(), + true); + iceberg::MapType map(key, value); + }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("key field name should be 'key', was 'notkey'"))); + ASSERT_THAT( + []() { + iceberg::SchemaField key(5, "key", std::make_shared(), true); + iceberg::SchemaField value(7, "notvalue", std::make_shared(), + true); + iceberg::MapType map(key, value); + }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("value field name should be 'value', was 'notvalue'"))); +} -TEST(TypeTest, Struct) {} +TEST(TypeTest, Struct) { + { + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); + iceberg::SchemaField field2(7, "bar", std::make_shared(), true); + iceberg::StructType struct_({field1, field2}); + std::span fields = struct_.fields(); + ASSERT_EQ(2, fields.size()); + ASSERT_EQ(field1, fields[0]); + ASSERT_EQ(field2, fields[1]); + ASSERT_THAT(struct_.GetFieldById(5), ::testing::Optional(field1)); + ASSERT_THAT(struct_.GetFieldById(7), 
::testing::Optional(field2)); + ASSERT_THAT(struct_.GetFieldByIndex(0), ::testing::Optional(field1)); + ASSERT_THAT(struct_.GetFieldByIndex(1), ::testing::Optional(field2)); + ASSERT_THAT(struct_.GetFieldByName("foo"), ::testing::Optional(field1)); + ASSERT_THAT(struct_.GetFieldByName("bar"), ::testing::Optional(field2)); + + ASSERT_EQ(std::nullopt, struct_.GetFieldById(0)); + ASSERT_EQ(std::nullopt, struct_.GetFieldByIndex(2)); + ASSERT_EQ(std::nullopt, struct_.GetFieldByIndex(-1)); + ASSERT_EQ(std::nullopt, struct_.GetFieldByName("element")); + } + ASSERT_THAT( + []() { + iceberg::SchemaField field1(5, "foo", std::make_shared(), + true); + iceberg::SchemaField field2(5, "bar", std::make_shared(), + true); + iceberg::StructType struct_({field1, field2}); + }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("duplicate field ID 5"))); +} From e609d5d8d00bed1474bb5b7f4c4c0ab0d197f3fb Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 22 Jan 2025 21:24:48 -0500 Subject: [PATCH 07/13] depend on gmock --- test/core/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/core/CMakeLists.txt b/test/core/CMakeLists.txt index 787f5cbd1..3261a0bee 100644 --- a/test/core/CMakeLists.txt +++ b/test/core/CMakeLists.txt @@ -17,6 +17,6 @@ add_executable(core_unittest) target_sources(core_unittest PRIVATE core_unittest.cc type_test.cc) -target_link_libraries(core_unittest PRIVATE iceberg_static GTest::gtest_main) +target_link_libraries(core_unittest PRIVATE iceberg_static GTest::gtest_main GTest::gmock) target_include_directories(core_unittest PRIVATE "${ICEBERG_INCLUDES}") add_test(NAME core_unittest COMMAND core_unittest) From bbc18d50620828001ac43e3408c7fee2947ceb8d Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 22 Jan 2025 21:52:19 -0500 Subject: [PATCH 08/13] add field/schema tests --- test/core/CMakeLists.txt | 3 +- test/core/schema_field_test.cc | 81 ++++++++++++++++++++++++++++++++ test/core/schema_test.cc | 85 
++++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 test/core/schema_field_test.cc create mode 100644 test/core/schema_test.cc diff --git a/test/core/CMakeLists.txt b/test/core/CMakeLists.txt index 3261a0bee..a7fba1cab 100644 --- a/test/core/CMakeLists.txt +++ b/test/core/CMakeLists.txt @@ -16,7 +16,8 @@ # under the License. add_executable(core_unittest) -target_sources(core_unittest PRIVATE core_unittest.cc type_test.cc) +target_sources(core_unittest PRIVATE core_unittest.cc schema_test.cc schema_field_test.cc + type_test.cc) target_link_libraries(core_unittest PRIVATE iceberg_static GTest::gtest_main GTest::gmock) target_include_directories(core_unittest PRIVATE "${ICEBERG_INCLUDES}") add_test(NAME core_unittest COMMAND core_unittest) diff --git a/test/core/schema_field_test.cc b/test/core/schema_field_test.cc new file mode 100644 index 000000000..69eb253ae --- /dev/null +++ b/test/core/schema_field_test.cc @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/schema_field.h" + +#include +#include + +#include + +#include "iceberg/type.h" +#include "iceberg/util/formatter.h" + +TEST(SchemaFieldTest, Basics) { + { + iceberg::SchemaField field(1, "foo", std::make_shared(), false); + EXPECT_EQ(1, field.field_id()); + EXPECT_EQ("foo", field.name()); + EXPECT_EQ(iceberg::TypeId::kInt32, field.type()->type_id()); + EXPECT_FALSE(field.optional()); + EXPECT_EQ("foo (1): int32 (required)", field.ToString()); + EXPECT_EQ("foo (1): int32 (required)", std::format("{}", field)); + } + { + iceberg::SchemaField field = iceberg::SchemaField::MakeOptional( + 2, "foo bar", std::make_shared(10)); + EXPECT_EQ(2, field.field_id()); + EXPECT_EQ("foo bar", field.name()); + EXPECT_EQ(iceberg::FixedType(10), *field.type()); + EXPECT_TRUE(field.optional()); + EXPECT_EQ("foo bar (2): fixed(10)", field.ToString()); + EXPECT_EQ("foo bar (2): fixed(10)", std::format("{}", field)); + } + { + iceberg::SchemaField field = iceberg::SchemaField::MakeRequired( + 2, "foo bar", std::make_shared(10)); + EXPECT_EQ(2, field.field_id()); + EXPECT_EQ("foo bar", field.name()); + EXPECT_EQ(iceberg::FixedType(10), *field.type()); + EXPECT_FALSE(field.optional()); + EXPECT_EQ("foo bar (2): fixed(10) (required)", field.ToString()); + EXPECT_EQ("foo bar (2): fixed(10) (required)", std::format("{}", field)); + } +} + +TEST(SchemaFieldTest, Equality) { + iceberg::SchemaField field1(1, "foo", std::make_shared(), false); + iceberg::SchemaField field2(2, "foo", std::make_shared(), false); + iceberg::SchemaField field3(1, "bar", std::make_shared(), false); + iceberg::SchemaField field4(1, "foo", std::make_shared(), false); + iceberg::SchemaField field5(1, "foo", std::make_shared(), true); + iceberg::SchemaField field6(1, "foo", std::make_shared(), false); + + ASSERT_EQ(field1, field1); + ASSERT_NE(field1, field2); + ASSERT_NE(field2, field1); + ASSERT_NE(field1, field3); + ASSERT_NE(field3, field2); + ASSERT_NE(field1, field4); + ASSERT_NE(field4, 
field1); + ASSERT_NE(field1, field5); + ASSERT_NE(field5, field1); + ASSERT_EQ(field1, field6); + ASSERT_EQ(field6, field1); +} diff --git a/test/core/schema_test.cc b/test/core/schema_test.cc new file mode 100644 index 000000000..aedb8c081 --- /dev/null +++ b/test/core/schema_test.cc @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/schema.h" + +#include +#include + +#include +#include + +#include "iceberg/schema_field.h" +#include "iceberg/util/formatter.h" + +TEST(SchemaTest, Basics) { + { + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); + iceberg::SchemaField field2(7, "bar", std::make_shared(), true); + iceberg::Schema schema(100, {field1, field2}); + ASSERT_EQ(schema, schema); + ASSERT_EQ(100, schema.schema_id()); + std::span fields = schema.fields(); + ASSERT_EQ(2, fields.size()); + ASSERT_EQ(field1, fields[0]); + ASSERT_EQ(field2, fields[1]); + ASSERT_THAT(schema.GetFieldById(5), ::testing::Optional(field1)); + ASSERT_THAT(schema.GetFieldById(7), ::testing::Optional(field2)); + ASSERT_THAT(schema.GetFieldByIndex(0), ::testing::Optional(field1)); + ASSERT_THAT(schema.GetFieldByIndex(1), ::testing::Optional(field2)); + ASSERT_THAT(schema.GetFieldByName("foo"), ::testing::Optional(field1)); + ASSERT_THAT(schema.GetFieldByName("bar"), ::testing::Optional(field2)); + + ASSERT_EQ(std::nullopt, schema.GetFieldById(0)); + ASSERT_EQ(std::nullopt, schema.GetFieldByIndex(2)); + ASSERT_EQ(std::nullopt, schema.GetFieldByIndex(-1)); + ASSERT_EQ(std::nullopt, schema.GetFieldByName("element")); + } + ASSERT_THAT( + []() { + iceberg::SchemaField field1(5, "foo", std::make_shared(), + true); + iceberg::SchemaField field2(5, "bar", std::make_shared(), + true); + iceberg::Schema schema(100, {field1, field2}); + }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("duplicate field ID 5"))); +} + +TEST(SchemaTest, Equality) { + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); + iceberg::SchemaField field2(7, "bar", std::make_shared(), true); + iceberg::SchemaField field3(5, "foobar", std::make_shared(), true); + iceberg::Schema schema1(100, {field1, field2}); + iceberg::Schema schema2(101, {field1, field2}); + iceberg::Schema schema3(101, {field1}); + iceberg::Schema schema4(101, {field3, field2}); + iceberg::Schema schema5(100, {field1, field2}); 
+ + ASSERT_EQ(schema1, schema1); + ASSERT_NE(schema1, schema2); + ASSERT_NE(schema2, schema1); + ASSERT_NE(schema1, schema3); + ASSERT_NE(schema3, schema1); + ASSERT_NE(schema1, schema4); + ASSERT_NE(schema4, schema1); + ASSERT_EQ(schema1, schema5); + ASSERT_EQ(schema5, schema1); +} From d2948b5fe5478f609ca4480d5ec324371a837176 Mon Sep 17 00:00:00 2001 From: David Li Date: Sun, 26 Jan 2025 18:58:25 -0500 Subject: [PATCH 09/13] updates --- src/iceberg/schema.h | 2 +- src/iceberg/type.cc | 34 +++++++++--------- src/iceberg/type.h | 43 ++++++++++++----------- src/iceberg/type_fwd.h | 27 +++++++-------- test/core/schema_field_test.cc | 20 +++++------ test/core/schema_test.cc | 9 +++-- test/core/type_test.cc | 63 ++++++++++++++++------------------ 7 files changed, 97 insertions(+), 101 deletions(-) diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 43ec9e16a..c58802d2f 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -44,7 +44,7 @@ class ICEBERG_EXPORT Schema : public StructType { /// \brief Get the schema ID. /// - /// Schemas are identified by a unique ID for the purposes of schema + /// A schema is identified by a unique ID for the purposes of schema /// evolution. 
[[nodiscard]] int32_t schema_id() const; diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 2cdeaf63f..6d8f89001 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -33,28 +33,26 @@ bool BooleanType::Equals(const Type& other) const { return other.type_id() == TypeId::kBoolean; } -TypeId Int32Type::type_id() const { return TypeId::kInt32; } -std::string Int32Type::ToString() const { return "int32"; } -bool Int32Type::Equals(const Type& other) const { - return other.type_id() == TypeId::kInt32; -} +TypeId IntType::type_id() const { return TypeId::kInt; } +std::string IntType::ToString() const { return "int"; } +bool IntType::Equals(const Type& other) const { return other.type_id() == TypeId::kInt; } -TypeId Int64Type::type_id() const { return TypeId::kInt64; } -std::string Int64Type::ToString() const { return "int64"; } -bool Int64Type::Equals(const Type& other) const { - return other.type_id() == TypeId::kInt64; +TypeId LongType::type_id() const { return TypeId::kLong; } +std::string LongType::ToString() const { return "long"; } +bool LongType::Equals(const Type& other) const { + return other.type_id() == TypeId::kLong; } -TypeId Float32Type::type_id() const { return TypeId::kFloat32; } -std::string Float32Type::ToString() const { return "float32"; } -bool Float32Type::Equals(const Type& other) const { - return other.type_id() == TypeId::kFloat32; +TypeId FloatType::type_id() const { return TypeId::kFloat; } +std::string FloatType::ToString() const { return "float"; } +bool FloatType::Equals(const Type& other) const { + return other.type_id() == TypeId::kFloat; } -TypeId Float64Type::type_id() const { return TypeId::kFloat64; } -std::string Float64Type::ToString() const { return "float64"; } -bool Float64Type::Equals(const Type& other) const { - return other.type_id() == TypeId::kFloat64; +TypeId DoubleType::type_id() const { return TypeId::kDouble; } +std::string DoubleType::ToString() const { return "double"; } +bool DoubleType::Equals(const 
Type& other) const { + return other.type_id() == TypeId::kDouble; } DecimalType::DecimalType(int32_t precision, int32_t scale) @@ -265,7 +263,7 @@ StructType::StructType(std::vector fields) : fields_(std::move(fiel field.field_id(), it->second, index)); } - index++; + ++index; } } diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 50f3d18a9..c6c2d93c3 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -63,14 +63,14 @@ class ICEBERG_EXPORT Type : public iceberg::util::Formattable { [[nodiscard]] virtual bool Equals(const Type& other) const = 0; }; -/// \brief A data type that may not have child fields. +/// \brief A data type that does not have child fields. class ICEBERG_EXPORT PrimitiveType : public Type { public: bool is_primitive() const override { return true; } bool is_nested() const override { return false; } }; -/// \brief A data type that may have child fields. +/// \brief A data type that has child fields. class ICEBERG_EXPORT NestedType : public Type { public: bool is_primitive() const override { return false; } @@ -93,7 +93,7 @@ class ICEBERG_EXPORT NestedType : public Type { /// Primitive types do not have nested fields. /// @{ -/// \brief A data type representing a boolean. +/// \brief A data type representing a boolean (true or false). class ICEBERG_EXPORT BooleanType : public PrimitiveType { public: BooleanType() = default; @@ -107,10 +107,10 @@ class ICEBERG_EXPORT BooleanType : public PrimitiveType { }; /// \brief A data type representing a 32-bit signed integer. -class ICEBERG_EXPORT Int32Type : public PrimitiveType { +class ICEBERG_EXPORT IntType : public PrimitiveType { public: - Int32Type() = default; - ~Int32Type() = default; + IntType() = default; + ~IntType() = default; TypeId type_id() const override; std::string ToString() const override; @@ -120,10 +120,10 @@ class ICEBERG_EXPORT Int32Type : public PrimitiveType { }; /// \brief A data type representing a 64-bit signed integer. 
-class ICEBERG_EXPORT Int64Type : public PrimitiveType { +class ICEBERG_EXPORT LongType : public PrimitiveType { public: - Int64Type() = default; - ~Int64Type() = default; + LongType() = default; + ~LongType() = default; TypeId type_id() const override; std::string ToString() const override; @@ -132,11 +132,12 @@ class ICEBERG_EXPORT Int64Type : public PrimitiveType { bool Equals(const Type& other) const override; }; -/// \brief A data type representing a 32-bit (single precision) float. -class ICEBERG_EXPORT Float32Type : public PrimitiveType { +/// \brief A data type representing a 32-bit (single precision) IEEE-754 +/// float. +class ICEBERG_EXPORT FloatType : public PrimitiveType { public: - Float32Type() = default; - ~Float32Type() = default; + FloatType() = default; + ~FloatType() = default; TypeId type_id() const override; std::string ToString() const override; @@ -145,11 +146,12 @@ class ICEBERG_EXPORT Float32Type : public PrimitiveType { bool Equals(const Type& other) const override; }; -/// \brief A data type representing a 64-bit (double precision) float. -class ICEBERG_EXPORT Float64Type : public PrimitiveType { +/// \brief A data type representing a 64-bit (double precision) IEEE-754 +/// float. +class ICEBERG_EXPORT DoubleType : public PrimitiveType { public: - Float64Type() = default; - ~Float64Type() = default; + DoubleType() = default; + ~DoubleType() = default; TypeId type_id() const override; std::string ToString() const override; @@ -240,7 +242,7 @@ class ICEBERG_EXPORT TimestampType : public TimestampBase { }; /// \brief A data type representing a timestamp as microseconds since the -/// epoch in UTC. +/// epoch in UTC. A time zone or offset is not stored. class ICEBERG_EXPORT TimestampTzType : public TimestampBase { public: TimestampTzType() = default; @@ -256,7 +258,7 @@ class ICEBERG_EXPORT TimestampTzType : public TimestampBase { bool Equals(const Type& other) const override; }; -/// \brief A data type representing a bytestring. 
+/// \brief A data type representing an arbitrary-length byte sequence. class ICEBERG_EXPORT BinaryType : public PrimitiveType { public: BinaryType() = default; @@ -269,7 +271,8 @@ class ICEBERG_EXPORT BinaryType : public PrimitiveType { bool Equals(const Type& other) const override; }; -/// \brief A data type representing a string. +/// \brief A data type representing an arbitrary-length character sequence +/// (encoded in UTF-8). class ICEBERG_EXPORT StringType : public PrimitiveType { public: StringType() = default; diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 5726c8f1e..89043938b 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -33,23 +33,23 @@ namespace iceberg { /// /// Iceberg V3 types are not currently supported. enum class TypeId { + kStruct, + kList, + kMap, kBoolean, - kInt32, - kInt64, - kFloat32, - kFloat64, + kInt, + kLong, + kFloat, + kDouble, kDecimal, kDate, kTime, kTimestamp, kTimestampTz, - kBinary, kString, - kFixed, kUuid, - kStruct, - kList, - kMap, + kFixed, + kBinary, }; /// \brief The time unit. In Iceberg V3 nanoseconds are also supported. 
@@ -62,10 +62,10 @@ class BooleanType; class DateType; class DecimalType; class FixedType; -class Float32Type; -class Float64Type; -class Int32Type; -class Int64Type; +class FloatType; +class DoubleType; +class IntType; +class LongType; class ListType; class MapType; class NestedType; @@ -74,7 +74,6 @@ class Schema; class SchemaField; class StringType; class StructType; -class StructType; class TimeType; class TimestampBase; class TimestampType; diff --git a/test/core/schema_field_test.cc b/test/core/schema_field_test.cc index 69eb253ae..d5fc63390 100644 --- a/test/core/schema_field_test.cc +++ b/test/core/schema_field_test.cc @@ -29,13 +29,13 @@ TEST(SchemaFieldTest, Basics) { { - iceberg::SchemaField field(1, "foo", std::make_shared(), false); + iceberg::SchemaField field(1, "foo", std::make_shared(), false); EXPECT_EQ(1, field.field_id()); EXPECT_EQ("foo", field.name()); - EXPECT_EQ(iceberg::TypeId::kInt32, field.type()->type_id()); + EXPECT_EQ(iceberg::TypeId::kInt, field.type()->type_id()); EXPECT_FALSE(field.optional()); - EXPECT_EQ("foo (1): int32 (required)", field.ToString()); - EXPECT_EQ("foo (1): int32 (required)", std::format("{}", field)); + EXPECT_EQ("foo (1): int (required)", field.ToString()); + EXPECT_EQ("foo (1): int (required)", std::format("{}", field)); } { iceberg::SchemaField field = iceberg::SchemaField::MakeOptional( @@ -60,12 +60,12 @@ TEST(SchemaFieldTest, Basics) { } TEST(SchemaFieldTest, Equality) { - iceberg::SchemaField field1(1, "foo", std::make_shared(), false); - iceberg::SchemaField field2(2, "foo", std::make_shared(), false); - iceberg::SchemaField field3(1, "bar", std::make_shared(), false); - iceberg::SchemaField field4(1, "foo", std::make_shared(), false); - iceberg::SchemaField field5(1, "foo", std::make_shared(), true); - iceberg::SchemaField field6(1, "foo", std::make_shared(), false); + iceberg::SchemaField field1(1, "foo", std::make_shared(), false); + iceberg::SchemaField field2(2, "foo", std::make_shared(), false); + 
iceberg::SchemaField field3(1, "bar", std::make_shared(), false); + iceberg::SchemaField field4(1, "foo", std::make_shared(), false); + iceberg::SchemaField field5(1, "foo", std::make_shared(), true); + iceberg::SchemaField field6(1, "foo", std::make_shared(), false); ASSERT_EQ(field1, field1); ASSERT_NE(field1, field2); diff --git a/test/core/schema_test.cc b/test/core/schema_test.cc index aedb8c081..43401947e 100644 --- a/test/core/schema_test.cc +++ b/test/core/schema_test.cc @@ -30,7 +30,7 @@ TEST(SchemaTest, Basics) { { - iceberg::SchemaField field1(5, "foo", std::make_shared(), true); + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(7, "bar", std::make_shared(), true); iceberg::Schema schema(100, {field1, field2}); ASSERT_EQ(schema, schema); @@ -53,8 +53,7 @@ TEST(SchemaTest, Basics) { } ASSERT_THAT( []() { - iceberg::SchemaField field1(5, "foo", std::make_shared(), - true); + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(5, "bar", std::make_shared(), true); iceberg::Schema schema(100, {field1, field2}); @@ -64,9 +63,9 @@ TEST(SchemaTest, Basics) { } TEST(SchemaTest, Equality) { - iceberg::SchemaField field1(5, "foo", std::make_shared(), true); + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(7, "bar", std::make_shared(), true); - iceberg::SchemaField field3(5, "foobar", std::make_shared(), true); + iceberg::SchemaField field3(5, "foobar", std::make_shared(), true); iceberg::Schema schema1(100, {field1, field2}); iceberg::Schema schema2(101, {field1, field2}); iceberg::Schema schema3(101, {field1}); diff --git a/test/core/type_test.cc b/test/core/type_test.cc index 7d7112021..3a4b8d70d 100644 --- a/test/core/type_test.cc +++ b/test/core/type_test.cc @@ -96,32 +96,32 @@ const static TypeTestCase kPrimitiveTypes[] = { .repr = "boolean", }, { - .name = "int32", - .type = std::make_shared(), - .type_id = 
iceberg::TypeId::kInt32, + .name = "int", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kInt, .primitive = true, - .repr = "int32", + .repr = "int", }, { - .name = "int64", - .type = std::make_shared(), - .type_id = iceberg::TypeId::kInt64, + .name = "long", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kLong, .primitive = true, - .repr = "int64", + .repr = "long", }, { - .name = "float32", - .type = std::make_shared(), - .type_id = iceberg::TypeId::kFloat32, + .name = "float", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kFloat, .primitive = true, - .repr = "float32", + .repr = "float", }, { - .name = "float64", - .type = std::make_shared(), - .type_id = iceberg::TypeId::kFloat64, + .name = "double", + .type = std::make_shared(), + .type_id = iceberg::TypeId::kDouble, .primitive = true, - .repr = "float64", + .repr = "double", }, { .name = "decimal9_2", @@ -206,45 +206,45 @@ const static TypeTestCase kNestedTypes[] = { { .name = "list_int", .type = std::make_shared( - 1, std::make_shared(), true), + 1, std::make_shared(), true), .type_id = iceberg::TypeId::kList, .primitive = false, - .repr = "list", + .repr = "list", }, { .name = "list_list_int", .type = std::make_shared( 1, - std::make_shared(2, std::make_shared(), + std::make_shared(2, std::make_shared(), true), false), .type_id = iceberg::TypeId::kList, .primitive = false, - .repr = "list (required)>", + .repr = "list (required)>", }, { .name = "map_int_string", .type = std::make_shared( iceberg::SchemaField::MakeRequired(1, "key", - std::make_shared()), + std::make_shared()), iceberg::SchemaField::MakeRequired(2, "value", std::make_shared())), .type_id = iceberg::TypeId::kMap, .primitive = false, - .repr = "map", + .repr = "map", }, { .name = "struct", .type = std::make_shared(std::vector{ iceberg::SchemaField::MakeRequired(1, "foo", - std::make_shared()), + std::make_shared()), iceberg::SchemaField::MakeOptional(2, "bar", std::make_shared()), }), .type_id = 
iceberg::TypeId::kStruct, .primitive = false, .repr = R"(struct< - foo (1): int64 (required) + foo (1): long (required) bar (2): string >)", }, @@ -318,8 +318,7 @@ TEST(TypeTest, Fixed) { TEST(TypeTest, List) { { - iceberg::SchemaField field(5, "element", std::make_shared(), - true); + iceberg::SchemaField field(5, "element", std::make_shared(), true); iceberg::ListType list(field); std::span fields = list.fields(); ASSERT_EQ(1, fields.size()); @@ -344,7 +343,7 @@ TEST(TypeTest, List) { TEST(TypeTest, Map) { { - iceberg::SchemaField key(5, "key", std::make_shared(), true); + iceberg::SchemaField key(5, "key", std::make_shared(), true); iceberg::SchemaField value(7, "value", std::make_shared(), true); iceberg::MapType map(key, value); std::span fields = map.fields(); @@ -365,8 +364,7 @@ TEST(TypeTest, Map) { } ASSERT_THAT( []() { - iceberg::SchemaField key(5, "notkey", std::make_shared(), - true); + iceberg::SchemaField key(5, "notkey", std::make_shared(), true); iceberg::SchemaField value(7, "value", std::make_shared(), true); iceberg::MapType map(key, value); @@ -375,7 +373,7 @@ TEST(TypeTest, Map) { ::testing::HasSubstr("key field name should be 'key', was 'notkey'"))); ASSERT_THAT( []() { - iceberg::SchemaField key(5, "key", std::make_shared(), true); + iceberg::SchemaField key(5, "key", std::make_shared(), true); iceberg::SchemaField value(7, "notvalue", std::make_shared(), true); iceberg::MapType map(key, value); @@ -386,7 +384,7 @@ TEST(TypeTest, Map) { TEST(TypeTest, Struct) { { - iceberg::SchemaField field1(5, "foo", std::make_shared(), true); + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(7, "bar", std::make_shared(), true); iceberg::StructType struct_({field1, field2}); std::span fields = struct_.fields(); @@ -407,8 +405,7 @@ TEST(TypeTest, Struct) { } ASSERT_THAT( []() { - iceberg::SchemaField field1(5, "foo", std::make_shared(), - true); + iceberg::SchemaField field1(5, "foo", std::make_shared(), true); 
iceberg::SchemaField field2(5, "bar", std::make_shared(), true); iceberg::StructType struct_({field1, field2}); From 22f734d9304543b39305a5dab81da5404e1fdee3 Mon Sep 17 00:00:00 2001 From: David Li Date: Sun, 26 Jan 2025 19:01:29 -0500 Subject: [PATCH 10/13] updates --- src/iceberg/type.cc | 284 ++++++++++++++++++++++---------------------- src/iceberg/type.h | 180 ++++++++++++++-------------- 2 files changed, 232 insertions(+), 232 deletions(-) diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 6d8f89001..47b7e36c4 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -27,118 +27,60 @@ namespace iceberg { -TypeId BooleanType::type_id() const { return TypeId::kBoolean; } -std::string BooleanType::ToString() const { return "boolean"; } -bool BooleanType::Equals(const Type& other) const { - return other.type_id() == TypeId::kBoolean; -} - -TypeId IntType::type_id() const { return TypeId::kInt; } -std::string IntType::ToString() const { return "int"; } -bool IntType::Equals(const Type& other) const { return other.type_id() == TypeId::kInt; } - -TypeId LongType::type_id() const { return TypeId::kLong; } -std::string LongType::ToString() const { return "long"; } -bool LongType::Equals(const Type& other) const { - return other.type_id() == TypeId::kLong; -} - -TypeId FloatType::type_id() const { return TypeId::kFloat; } -std::string FloatType::ToString() const { return "float"; } -bool FloatType::Equals(const Type& other) const { - return other.type_id() == TypeId::kFloat; -} - -TypeId DoubleType::type_id() const { return TypeId::kDouble; } -std::string DoubleType::ToString() const { return "double"; } -bool DoubleType::Equals(const Type& other) const { - return other.type_id() == TypeId::kDouble; -} +StructType::StructType(std::vector fields) : fields_(std::move(fields)) { + size_t index = 0; + for (const auto& field : fields_) { + auto [it, inserted] = field_id_to_index_.try_emplace(field.field_id(), index); + if (!inserted) { + throw 
std::runtime_error( + std::format("StructType: duplicate field ID {} (field indices {} and {})", + field.field_id(), it->second, index)); + } -DecimalType::DecimalType(int32_t precision, int32_t scale) - : precision_(precision), scale_(scale) { - if (precision < 0 || precision > kMaxPrecision) { - throw std::runtime_error( - std::format("DecimalType: precision must be in [0, 38], was {}", precision)); + ++index; } } -int32_t DecimalType::precision() const { return precision_; } -int32_t DecimalType::scale() const { return scale_; } -TypeId DecimalType::type_id() const { return TypeId::kDecimal; } -std::string DecimalType::ToString() const { - return std::format("decimal({}, {})", precision_, scale_); -} -bool DecimalType::Equals(const Type& other) const { - if (other.type_id() != TypeId::kDecimal) { - return false; +TypeId StructType::type_id() const { return TypeId::kStruct; } +std::string StructType::ToString() const { + std::string repr = "struct<\n"; + for (const auto& field : fields_) { + std::format_to(std::back_inserter(repr), " {}\n", field); } - const auto& decimal = static_cast(other); - return precision_ == decimal.precision_ && scale_ == decimal.scale_; -} - -TypeId TimeType::type_id() const { return TypeId::kTime; } -std::string TimeType::ToString() const { return "time"; } -bool TimeType::Equals(const Type& other) const { - return other.type_id() == TypeId::kTime; -} - -TypeId DateType::type_id() const { return TypeId::kDate; } -std::string DateType::ToString() const { return "date"; } -bool DateType::Equals(const Type& other) const { - return other.type_id() == TypeId::kDate; -} - -bool TimestampType::is_zoned() const { return false; } -TimeUnit TimestampType::time_unit() const { return TimeUnit::kMicrosecond; } -TypeId TimestampType::type_id() const { return TypeId::kTimestamp; } -std::string TimestampType::ToString() const { return "timestamp"; } -bool TimestampType::Equals(const Type& other) const { - return other.type_id() == TypeId::kTimestamp; 
-} - -bool TimestampTzType::is_zoned() const { return true; } -TimeUnit TimestampTzType::time_unit() const { return TimeUnit::kMicrosecond; } -TypeId TimestampTzType::type_id() const { return TypeId::kTimestampTz; } -std::string TimestampTzType::ToString() const { return "timestamptz"; } -bool TimestampTzType::Equals(const Type& other) const { - return other.type_id() == TypeId::kTimestampTz; + repr += ">"; + return repr; } - -TypeId BinaryType::type_id() const { return TypeId::kBinary; } -std::string BinaryType::ToString() const { return "binary"; } -bool BinaryType::Equals(const Type& other) const { - return other.type_id() == TypeId::kBinary; +std::span StructType::fields() const { return fields_; } +std::optional> StructType::GetFieldById( + int32_t field_id) const { + auto it = field_id_to_index_.find(field_id); + if (it == field_id_to_index_.end()) return std::nullopt; + return fields_[it->second]; } - -TypeId StringType::type_id() const { return TypeId::kString; } -std::string StringType::ToString() const { return "string"; } -bool StringType::Equals(const Type& other) const { - return other.type_id() == TypeId::kString; +std::optional> StructType::GetFieldByIndex( + int32_t index) const { + if (index < 0 || index >= static_cast(fields_.size())) { + return std::nullopt; + } + return fields_[index]; } - -FixedType::FixedType(int32_t length) : length_(length) { - if (length < 0) { - throw std::runtime_error( - std::format("FixedType: length must be >= 0, was {}", length)); +std::optional> StructType::GetFieldByName( + std::string_view name) const { + // TODO: what is the right behavior if there are duplicate names? (Are + // duplicate names permitted?) 
+ for (const auto& field : fields_) { + if (field.name() == name) { + return field; + } } + return std::nullopt; } - -int32_t FixedType::length() const { return length_; } -TypeId FixedType::type_id() const { return TypeId::kFixed; } -std::string FixedType::ToString() const { return std::format("fixed({})", length_); } -bool FixedType::Equals(const Type& other) const { - if (other.type_id() != TypeId::kFixed) { +bool StructType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kStruct) { return false; } - const auto& fixed = static_cast(other); - return length_ == fixed.length_; -} - -TypeId UuidType::type_id() const { return TypeId::kUuid; } -std::string UuidType::ToString() const { return "uuid"; } -bool UuidType::Equals(const Type& other) const { - return other.type_id() == TypeId::kUuid; + const auto& struct_ = static_cast(other); + return fields_ == struct_.fields_; } ListType::ListType(SchemaField element) : element_(std::move(element)) { @@ -253,60 +195,118 @@ bool MapType::Equals(const Type& other) const { return fields_ == map.fields_; } -StructType::StructType(std::vector fields) : fields_(std::move(fields)) { - size_t index = 0; - for (const auto& field : fields_) { - auto [it, inserted] = field_id_to_index_.try_emplace(field.field_id(), index); - if (!inserted) { - throw std::runtime_error( - std::format("StructType: duplicate field ID {} (field indices {} and {})", - field.field_id(), it->second, index)); - } +TypeId BooleanType::type_id() const { return TypeId::kBoolean; } +std::string BooleanType::ToString() const { return "boolean"; } +bool BooleanType::Equals(const Type& other) const { + return other.type_id() == TypeId::kBoolean; +} - ++index; - } +TypeId IntType::type_id() const { return TypeId::kInt; } +std::string IntType::ToString() const { return "int"; } +bool IntType::Equals(const Type& other) const { return other.type_id() == TypeId::kInt; } + +TypeId LongType::type_id() const { return TypeId::kLong; } +std::string 
LongType::ToString() const { return "long"; } +bool LongType::Equals(const Type& other) const { + return other.type_id() == TypeId::kLong; } -TypeId StructType::type_id() const { return TypeId::kStruct; } -std::string StructType::ToString() const { - std::string repr = "struct<\n"; - for (const auto& field : fields_) { - std::format_to(std::back_inserter(repr), " {}\n", field); +TypeId FloatType::type_id() const { return TypeId::kFloat; } +std::string FloatType::ToString() const { return "float"; } +bool FloatType::Equals(const Type& other) const { + return other.type_id() == TypeId::kFloat; +} + +TypeId DoubleType::type_id() const { return TypeId::kDouble; } +std::string DoubleType::ToString() const { return "double"; } +bool DoubleType::Equals(const Type& other) const { + return other.type_id() == TypeId::kDouble; +} + +DecimalType::DecimalType(int32_t precision, int32_t scale) + : precision_(precision), scale_(scale) { + if (precision < 0 || precision > kMaxPrecision) { + throw std::runtime_error( + std::format("DecimalType: precision must be in [0, 38], was {}", precision)); } - repr += ">"; - return repr; } -std::span StructType::fields() const { return fields_; } -std::optional> StructType::GetFieldById( - int32_t field_id) const { - auto it = field_id_to_index_.find(field_id); - if (it == field_id_to_index_.end()) return std::nullopt; - return fields_[it->second]; + +int32_t DecimalType::precision() const { return precision_; } +int32_t DecimalType::scale() const { return scale_; } +TypeId DecimalType::type_id() const { return TypeId::kDecimal; } +std::string DecimalType::ToString() const { + return std::format("decimal({}, {})", precision_, scale_); } -std::optional> StructType::GetFieldByIndex( - int32_t index) const { - if (index < 0 || index >= static_cast(fields_.size())) { - return std::nullopt; +bool DecimalType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kDecimal) { + return false; } - return fields_[index]; + const auto& 
decimal = static_cast(other); + return precision_ == decimal.precision_ && scale_ == decimal.scale_; } -std::optional> StructType::GetFieldByName( - std::string_view name) const { - // TODO: what is the right behavior if there are duplicate names? (Are - // duplicate names permitted?) - for (const auto& field : fields_) { - if (field.name() == name) { - return field; - } + +TypeId DateType::type_id() const { return TypeId::kDate; } +std::string DateType::ToString() const { return "date"; } +bool DateType::Equals(const Type& other) const { + return other.type_id() == TypeId::kDate; +} + +TypeId TimeType::type_id() const { return TypeId::kTime; } +std::string TimeType::ToString() const { return "time"; } +bool TimeType::Equals(const Type& other) const { + return other.type_id() == TypeId::kTime; +} + +bool TimestampType::is_zoned() const { return false; } +TimeUnit TimestampType::time_unit() const { return TimeUnit::kMicrosecond; } +TypeId TimestampType::type_id() const { return TypeId::kTimestamp; } +std::string TimestampType::ToString() const { return "timestamp"; } +bool TimestampType::Equals(const Type& other) const { + return other.type_id() == TypeId::kTimestamp; +} + +bool TimestampTzType::is_zoned() const { return true; } +TimeUnit TimestampTzType::time_unit() const { return TimeUnit::kMicrosecond; } +TypeId TimestampTzType::type_id() const { return TypeId::kTimestampTz; } +std::string TimestampTzType::ToString() const { return "timestamptz"; } +bool TimestampTzType::Equals(const Type& other) const { + return other.type_id() == TypeId::kTimestampTz; +} + +TypeId StringType::type_id() const { return TypeId::kString; } +std::string StringType::ToString() const { return "string"; } +bool StringType::Equals(const Type& other) const { + return other.type_id() == TypeId::kString; +} + +TypeId UuidType::type_id() const { return TypeId::kUuid; } +std::string UuidType::ToString() const { return "uuid"; } +bool UuidType::Equals(const Type& other) const { + return 
other.type_id() == TypeId::kUuid; +} + +FixedType::FixedType(int32_t length) : length_(length) { + if (length < 0) { + throw std::runtime_error( + std::format("FixedType: length must be >= 0, was {}", length)); } - return std::nullopt; } -bool StructType::Equals(const Type& other) const { - if (other.type_id() != TypeId::kStruct) { + +int32_t FixedType::length() const { return length_; } +TypeId FixedType::type_id() const { return TypeId::kFixed; } +std::string FixedType::ToString() const { return std::format("fixed({})", length_); } +bool FixedType::Equals(const Type& other) const { + if (other.type_id() != TypeId::kFixed) { return false; } - const auto& struct_ = static_cast(other); - return fields_ == struct_.fields_; + const auto& fixed = static_cast(other); + return length_ == fixed.length_; +} + +TypeId BinaryType::type_id() const { return TypeId::kBinary; } +std::string BinaryType::ToString() const { return "binary"; } +bool BinaryType::Equals(const Type& other) const { + return other.type_id() == TypeId::kBinary; } } // namespace iceberg diff --git a/src/iceberg/type.h b/src/iceberg/type.h index c6c2d93c3..d4fe4e019 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -89,6 +89,96 @@ class ICEBERG_EXPORT NestedType : public Type { GetFieldByName(std::string_view name) const = 0; }; +/// \defgroup type-nested Nested Types +/// Nested types have child fields. +/// @{ + +/// \brief A data type representing a struct with nested fields. 
+class ICEBERG_EXPORT StructType : public NestedType { + public: + explicit StructType(std::vector fields); + ~StructType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + std::span fields() const override; + std::optional> GetFieldById( + int32_t field_id) const override; + std::optional> GetFieldByIndex( + int32_t index) const override; + std::optional> GetFieldByName( + std::string_view name) const override; + + protected: + bool Equals(const Type& other) const override; + + std::vector fields_; + std::unordered_map field_id_to_index_; +}; + +/// \brief A data type representing a list of values. +class ICEBERG_EXPORT ListType : public NestedType { + public: + constexpr static const std::string_view kElementName = "element"; + + /// \brief Construct a list of the given element. The name of the child + /// field should be "element". + explicit ListType(SchemaField element); + /// \brief Construct a list of the given element type. + ListType(int32_t field_id, std::shared_ptr type, bool optional); + ~ListType() = default; + + TypeId type_id() const override; + std::string ToString() const override; + + std::span fields() const override; + std::optional> GetFieldById( + int32_t field_id) const override; + std::optional> GetFieldByIndex( + int32_t index) const override; + std::optional> GetFieldByName( + std::string_view name) const override; + + protected: + bool Equals(const Type& other) const override; + + SchemaField element_; +}; + +/// \brief A data type representing a dictionary of values. +class ICEBERG_EXPORT MapType : public NestedType { + public: + constexpr static const std::string_view kKeyName = "key"; + constexpr static const std::string_view kValueName = "value"; + + /// \brief Construct a map of the given key/value fields. The field names + /// should be "key" and "value", respectively. 
+ explicit MapType(SchemaField key, SchemaField value); + ~MapType() = default; + + const SchemaField& key() const; + const SchemaField& value() const; + + TypeId type_id() const override; + std::string ToString() const override; + + std::span fields() const override; + std::optional> GetFieldById( + int32_t field_id) const override; + std::optional> GetFieldByIndex( + int32_t index) const override; + std::optional> GetFieldByName( + std::string_view name) const override; + + protected: + bool Equals(const Type& other) const override; + + std::array fields_; +}; + +/// @} + /// \defgroup type-primitive Primitive Types /// Primitive types do not have nested fields. /// @{ @@ -321,94 +411,4 @@ class ICEBERG_EXPORT UuidType : public PrimitiveType { /// @} -/// \defgroup type-nested Nested Types -/// Nested types have nested fields. -/// @{ - -/// \brief A data type representing a list of values. -class ICEBERG_EXPORT ListType : public NestedType { - public: - constexpr static const std::string_view kElementName = "element"; - - /// \brief Construct a list of the given element. The name of the child - /// field should be "element". - explicit ListType(SchemaField element); - /// \brief Construct a list of the given element type. - ListType(int32_t field_id, std::shared_ptr type, bool optional); - ~ListType() = default; - - TypeId type_id() const override; - std::string ToString() const override; - - std::span fields() const override; - std::optional> GetFieldById( - int32_t field_id) const override; - std::optional> GetFieldByIndex( - int32_t index) const override; - std::optional> GetFieldByName( - std::string_view name) const override; - - protected: - bool Equals(const Type& other) const override; - - SchemaField element_; -}; - -/// \brief A data type representing a dictionary of values. 
-class ICEBERG_EXPORT MapType : public NestedType { - public: - constexpr static const std::string_view kKeyName = "key"; - constexpr static const std::string_view kValueName = "value"; - - /// \brief Construct a map of the given key/value fields. The field names - /// should be "key" and "value", respectively. - explicit MapType(SchemaField key, SchemaField value); - ~MapType() = default; - - const SchemaField& key() const; - const SchemaField& value() const; - - TypeId type_id() const override; - std::string ToString() const override; - - std::span fields() const override; - std::optional> GetFieldById( - int32_t field_id) const override; - std::optional> GetFieldByIndex( - int32_t index) const override; - std::optional> GetFieldByName( - std::string_view name) const override; - - protected: - bool Equals(const Type& other) const override; - - std::array fields_; -}; - -/// \brief A data type representing a struct with nested fields. -class ICEBERG_EXPORT StructType : public NestedType { - public: - explicit StructType(std::vector fields); - ~StructType() = default; - - TypeId type_id() const override; - std::string ToString() const override; - - std::span fields() const override; - std::optional> GetFieldById( - int32_t field_id) const override; - std::optional> GetFieldByIndex( - int32_t index) const override; - std::optional> GetFieldByName( - std::string_view name) const override; - - protected: - bool Equals(const Type& other) const override; - - std::vector fields_; - std::unordered_map field_id_to_index_; -}; - -/// @} - } // namespace iceberg From 285f7d64b7a69e0212a5ad78bead85b45678f862 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 27 Jan 2025 08:21:23 -0500 Subject: [PATCH 11/13] nit --- src/iceberg/type.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/iceberg/type.h b/src/iceberg/type.h index d4fe4e019..3bff4bb12 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -84,7 +84,9 @@ class ICEBERG_EXPORT NestedType : 
public Type { /// \brief Get a field by index. [[nodiscard]] virtual std::optional> GetFieldByIndex(int32_t index) const = 0; - /// \brief Get a field by name. + /// \brief Get a field by name (case-sensitive). Behavior is undefined if + /// the field name is not unique; prefer GetFieldById or GetFieldByIndex + /// when possible. [[nodiscard]] virtual std::optional> GetFieldByName(std::string_view name) const = 0; }; From 35b73e431daff3922f21b3f174d887322bcd49de Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 27 Jan 2025 08:28:20 -0500 Subject: [PATCH 12/13] nit --- src/iceberg/util/formatter.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/iceberg/util/formatter.h b/src/iceberg/util/formatter.h index ed1c5cbaa..d9a67415f 100644 --- a/src/iceberg/util/formatter.h +++ b/src/iceberg/util/formatter.h @@ -23,11 +23,10 @@ /// A specialization of std::formatter for Formattable objects. This header /// is separate from iceberg/util/formattable.h so that the latter (which is /// meant to be included widely) does not leak unnecessarily into -/// other headers. +/// other headers. You must include this header to format a Formattable. #include #include -#include #include #include "iceberg/util/formattable.h" From e4a54131e438474ae3c0c41596e310ac2c1bc74b Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 27 Jan 2025 08:29:30 -0500 Subject: [PATCH 13/13] nit --- src/iceberg/schema.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 12874d68d..52a57db7a 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -22,7 +22,7 @@ #include #include "iceberg/type.h" -#include "iceberg/util/formatter.h" +#include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg {