Skip to content

Commit 4ebe732

Browse files
authored
fix: result type of DayTransform should be date (#285)
Result type of Day transform should be `int` per the spec, but in the Java impl, it is `DateType`, this discrepancy has been discussed on devlist, see [1]. iceberg-iceberg and iceberg-rust all conform to the java impl, so make iceberg-cpp do the same. The added comments are borrowed from iceberg-python. [1] https://lists.apache.org/thread/2gq7b54nvc9q6f1j08l9lnzgm5onkmx5
1 parent 1a6f9c8 commit 4ebe732

File tree

5 files changed

+64
-3
lines changed

5 files changed

+64
-3
lines changed

src/iceberg/test/partition_spec_test.cc

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424

2525
#include <gmock/gmock.h>
2626
#include <gtest/gtest.h>
27+
#include <nlohmann/json.hpp>
2728

29+
#include "iceberg/json_internal.h"
2830
#include "iceberg/partition_field.h"
2931
#include "iceberg/schema.h"
3032
#include "iceberg/schema_field.h"
@@ -106,4 +108,54 @@ TEST(PartitionSpecTest, PartitionSchemaTest) {
106108
EXPECT_EQ(pt_field2.name(), partition_schema.value()->fields()[1].name());
107109
EXPECT_EQ(pt_field2.field_id(), partition_schema.value()->fields()[1].field_id());
108110
}
111+
112+
TEST(PartitionSpecTest, PartitionTypeTest) {
113+
nlohmann::json json = R"(
114+
{
115+
"spec-id": 1,
116+
"fields": [ {
117+
"source-id": 4,
118+
"field-id": 1000,
119+
"name": "ts_day",
120+
"transform": "day"
121+
}, {
122+
"source-id": 1,
123+
"field-id": 1001,
124+
"name": "id_bucket",
125+
"transform": "bucket[16]"
126+
}, {
127+
"source-id": 2,
128+
"field-id": 1002,
129+
"name": "id_truncate",
130+
"transform": "truncate[4]"
131+
} ]
132+
})"_json;
133+
134+
SchemaField field1(1, "id", int32(), false);
135+
SchemaField field2(2, "name", string(), false);
136+
SchemaField field3(3, "ts", timestamp(), false);
137+
SchemaField field4(4, "ts_day", timestamp(), false);
138+
SchemaField field5(5, "id_bucket", int32(), false);
139+
SchemaField field6(6, "id_truncate", int32(), false);
140+
auto const schema = std::make_shared<Schema>(
141+
std::vector<SchemaField>{field1, field2, field3, field4, field5, field6},
142+
Schema::kInitialSchemaId);
143+
144+
auto parsed_spec_result = PartitionSpecFromJson(schema, json);
145+
ASSERT_TRUE(parsed_spec_result.has_value()) << parsed_spec_result.error().message;
146+
147+
auto partition_schema = parsed_spec_result.value()->PartitionType();
148+
149+
SchemaField pt_field1(1000, "ts_day", date(), true);
150+
SchemaField pt_field2(1001, "id_bucket", int32(), true);
151+
SchemaField pt_field3(1002, "id_truncate", string(), true);
152+
153+
ASSERT_TRUE(partition_schema.has_value());
154+
ASSERT_EQ(3, partition_schema.value()->fields().size());
155+
156+
EXPECT_EQ(pt_field1, partition_schema.value()->fields()[0]);
157+
EXPECT_EQ(pt_field2, partition_schema.value()->fields()[1]);
158+
EXPECT_EQ(pt_field3, partition_schema.value()->fields()[2]);
159+
}
160+
109161
} // namespace iceberg

src/iceberg/test/transform_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ TEST(TransformResultTypeTest, PositiveCases) {
134134
.expected_result_type = iceberg::int32()},
135135
{.str = "day",
136136
.source_type = iceberg::timestamp(),
137-
.expected_result_type = iceberg::int32()},
137+
.expected_result_type = iceberg::date()},
138138
{.str = "hour",
139139
.source_type = iceberg::timestamp(),
140140
.expected_result_type = iceberg::int32()},

src/iceberg/transform.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,11 @@ class ICEBERG_EXPORT TransformFunction {
202202
/// \brief Get the source type of transform function
203203
const std::shared_ptr<Type>& source_type() const;
204204
/// \brief Get the result type of transform function
205+
///
206+
/// Note: This method defines both the physical and display representation of the
207+
/// partition field. The physical representation must conform to the Iceberg spec. The
208+
/// display representation can deviate from the spec, such as by transforming the value
209+
/// into a more human-readable format.
205210
virtual std::shared_ptr<Type> ResultType() const = 0;
206211

207212
friend bool operator==(const TransformFunction& lhs, const TransformFunction& rhs) {

src/iceberg/transform_function.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ Result<Literal> DayTransform::Transform(const Literal& literal) {
193193
return TemporalUtils::ExtractDay(literal);
194194
}
195195

196-
std::shared_ptr<Type> DayTransform::ResultType() const { return int32(); }
196+
std::shared_ptr<Type> DayTransform::ResultType() const { return date(); }
197197

198198
Result<std::unique_ptr<TransformFunction>> DayTransform::Make(
199199
std::shared_ptr<Type> const& source_type) {

src/iceberg/transform_function.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,11 @@ class ICEBERG_EXPORT DayTransform : public TransformFunction {
141141
/// \brief Extract a date or timestamp day, as days from 1970-01-01.
142142
Result<Literal> Transform(const Literal& literal) override;
143143

144-
/// \brief Returns INT32 as the output type.
144+
/// \brief Return the result type of a day transform.
145+
///
146+
/// Note: The physical representation conforms to the Iceberg spec as DateType is
147+
/// internally converted to int. The DateType returned here provides a more
148+
/// human-readable way to display the partition field.
145149
std::shared_ptr<Type> ResultType() const override;
146150

147151
/// \brief Create a DayTransform.

0 commit comments

Comments
 (0)