Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ INLINE_INHERITED_MEMB = NO
# shortest path that makes the file name unique will be used
# The default value is: YES.

FULL_PATH_NAMES = NO
FULL_PATH_NAMES = YES

# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
# Stripping is only done if one of the specified strings matches the left-hand
Expand All @@ -207,7 +207,7 @@ FULL_PATH_NAMES = NO
# will be relative from the directory where doxygen is started.
# This tag requires that the tag FULL_PATH_NAMES is set to YES.

STRIP_FROM_PATH =
STRIP_FROM_PATH = ../src

# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
# path mentioned in the documentation of a class, which tells the reader which
Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.

set(ICEBERG_SOURCES demo_table.cc)
set(ICEBERG_SOURCES demo_table.cc schema.cc schema_field.cc type.cc)

add_iceberg_lib(iceberg
SOURCES
Expand Down
47 changes: 47 additions & 0 deletions src/iceberg/schema.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/schema.h"

#include <format>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is redundant.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it good practice to IWYU?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not that familiar with IWYU yet...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We use std::format in the .cc file so we should include format, even if other headers happen to include it already


#include "iceberg/type.h"
#include "iceberg/util/formatter.h"

namespace iceberg {

Schema::Schema(int32_t schema_id, std::vector<SchemaField> fields)
: StructType(std::move(fields)), schema_id_(schema_id) {}

int32_t Schema::schema_id() const { return schema_id_; }

std::string Schema::ToString() const {
std::string repr = "schema<";
for (const auto& field : fields_) {
std::format_to(std::back_inserter(repr), " {}\n", field);
}
repr += ">";
return repr;
}

bool Schema::Equals(const Schema& other) const {
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
}

} // namespace iceberg
64 changes: 64 additions & 0 deletions src/iceberg/schema.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

/// \file iceberg/schema.h
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not familiar with this style of commenting. Could you please explain me why including the file name into a comment needed here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not done with it yet, this is for Doxygen which will generate a page to document the header.

/// Schemas for Iceberg tables. This header contains the definition of Schema
/// and any utility functions. See iceberg/type.h and iceberg/field.h as well.

#include <cstdint>
#include <string>
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/schema_field.h"
#include "iceberg/type.h"

namespace iceberg {

/// \brief A schema for a Table.
///
/// A schema is a list of typed columns, along with a unique integer ID. A
/// Table may have different schemas over its lifetime due to schema
/// evolution.
class ICEBERG_EXPORT Schema : public StructType {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

: public StructType
I don't think that Schema should derive from StructType. I get that it will hold a list of Fields such as StructType, but still, Schema in general is not a Type, would be pretty unnatural to derive from it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Note that various type systems do actually just represent a schema as a struct type, including nanoarrow/Arrow C Data Interface. At the very least they are isomorphic and you can see methods that convert between them often. That said I don't feel strongly about this.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also inclined to inherit Schema from StructType. In practice this inheritance avoid an unnecessary conversion from Schema to StructType to implement type visitors.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: Could visitors work with Schema if we had a Schema::asStructType or such? Doesn't seem a big deal to do that conversion for me and then the class hierarchy would be more intuitive in my opinion. Or I miss something :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would also work, it would just add the conversion, and then duplicating all the methods. No big deal either way to me at least.

(I suppose the OOP approach is to add a StructLike interface that both Struct and Schema implement.)

public:
Schema(int32_t schema_id, std::vector<SchemaField> fields);

/// \brief Get the schema ID.
///
/// Schemas are identified by a unique ID for the purposes of schema
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: should this be A Schema is identified by a unique ID or Schemas are identified by unique IDs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed to "A schema..."

/// evolution.
[[nodiscard]] int32_t schema_id() const;
Copy link
Collaborator

@gaborkaszab gaborkaszab Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The functions within this class may not follow the same naming convention: See schema_id vs ToString, Equals
I found the same mismatch below too, so I'm wondering if it's me missing something here.

Update: I've just checked the style guide, and I guess this is fine: "accessors and mutators may be named like variables"
https://google.github.io/styleguide/cppguide.html#Function_Names

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Google style has trivial accessors using snake_case.


[[nodiscard]] std::string ToString() const;

friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }

friend bool operator!=(const Schema& lhs, const Schema& rhs) { return !(lhs == rhs); }

private:
/// \brief Compare two schemas for equality.
[[nodiscard]] bool Equals(const Schema& other) const;

const int32_t schema_id_;
};

} // namespace iceberg
64 changes: 64 additions & 0 deletions src/iceberg/schema_field.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/schema_field.h"

#include <format>

#include "iceberg/type.h"
#include "iceberg/util/formatter.h"

namespace iceberg {

SchemaField::SchemaField(int32_t field_id, std::string name, std::shared_ptr<Type> type,
bool optional)
: field_id_(field_id),
name_(std::move(name)),
type_(std::move(type)),
optional_(optional) {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering why python moved to required instead of optional here:
https://github.com/apache/iceberg-python/blob/main/pyiceberg/types.py#L308
The change from optional to required seemed intentional here, @Fokko any thoughts?
BTW it seems Java went for optional too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is because the spec serializes it as required: https://iceberg.apache.org/spec/#schemas This way it is more straightforward to serialize it using the library that we use (Pydantic). See the original issue: apache/iceberg#4985. In PyIceberg we have a property called optional as well, so you can choose.


SchemaField SchemaField::MakeOptional(int32_t field_id, std::string name,
std::shared_ptr<Type> type) {
return SchemaField(field_id, std::move(name), std::move(type), true);
}

SchemaField SchemaField::MakeRequired(int32_t field_id, std::string name,
std::shared_ptr<Type> type) {
return SchemaField(field_id, std::move(name), std::move(type), false);
}

int32_t SchemaField::field_id() const { return field_id_; }

std::string_view SchemaField::name() const { return name_; }

const std::shared_ptr<Type>& SchemaField::type() const { return type_; }

bool SchemaField::optional() const { return optional_; }

std::string SchemaField::ToString() const {
return std::format("{} ({}): {}{}", name_, field_id_, *type_,
optional_ ? "" : " (required)");
Comment on lines +55 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it is just me, but I'm not a big fan of the implied (optional)

}

bool SchemaField::Equals(const SchemaField& other) const {
return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ &&
optional_ == other.optional_;
}

} // namespace iceberg
87 changes: 87 additions & 0 deletions src/iceberg/schema_field.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

/// \file iceberg/schema_field.h
/// A (schema) field is a name and a type and is part of a schema or nested
/// type (e.g. a struct).

#include <cstdint>
#include <memory>
#include <string>
#include <string_view>

#include "iceberg/iceberg_export.h"
#include "iceberg/type_fwd.h"
#include "iceberg/util/formattable.h"

namespace iceberg {

/// \brief A type combined with a name.
class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable {
public:
/// \brief Construct a field.
/// \param[in] field_id The field ID.
/// \param[in] name The field name.
/// \param[in] type The field type.
/// \param[in] optional Whether values of this field are required or nullable.
SchemaField(int32_t field_id, std::string name, std::shared_ptr<Type> type,
bool optional);

/// \brief Construct an optional (nullable) field.
static SchemaField MakeOptional(int32_t field_id, std::string name,
std::shared_ptr<Type> type);
/// \brief Construct a required (non-null) field.
static SchemaField MakeRequired(int32_t field_id, std::string name,
std::shared_ptr<Type> type);

/// \brief Get the field ID.
[[nodiscard]] int32_t field_id() const;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it an overkill to add [[nodiscard]] to these accessors?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it hurts

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, it is just personal flavor. I have commented at #32 (comment)


/// \brief Get the field name.
[[nodiscard]] std::string_view name() const;

/// \brief Get the field type.
[[nodiscard]] const std::shared_ptr<Type>& type() const;

/// \brief Get whether the field is optional.
[[nodiscard]] bool optional() const;

[[nodiscard]] std::string ToString() const;

friend bool operator==(const SchemaField& lhs, const SchemaField& rhs) {
return lhs.Equals(rhs);
}

friend bool operator!=(const SchemaField& lhs, const SchemaField& rhs) {
return !(lhs == rhs);
}

private:
/// \brief Compare two fields for equality.
[[nodiscard]] bool Equals(const SchemaField& other) const;

int32_t field_id_;
std::string name_;
std::shared_ptr<Type> type_;
bool optional_;
};

} // namespace iceberg
Loading
Loading