Skip to content

Commit e47f972

Browse files
authored
feat: add json serialization for schema (#65)
1 parent 2819128 commit e47f972

File tree

6 files changed

+509
-5
lines changed

6 files changed

+509
-5
lines changed

src/iceberg/error.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ enum class ErrorKind {
3838
kNotImplemented,
3939
kUnknownError,
4040
kNotSupported,
41+
kJsonParseError,
4142
};
4243

4344
/// \brief Error with a kind and a message.

src/iceberg/schema_internal.cc

Lines changed: 293 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,49 @@
2222
#include <cstring>
2323
#include <format>
2424
#include <optional>
25+
#include <regex>
2526
#include <string>
2627

28+
#include <nlohmann/json.hpp>
29+
2730
#include "iceberg/expected.h"
2831
#include "iceberg/schema.h"
32+
#include "iceberg/type.h"
33+
#include "iceberg/util/macros.h"
2934

3035
namespace iceberg {
3136

3237
namespace {
3338

39+
// Constants for Arrow schema metadata
3440
constexpr const char* kArrowExtensionName = "ARROW:extension:name";
3541
constexpr const char* kArrowExtensionMetadata = "ARROW:extension:metadata";
3642
constexpr const char* kArrowUuidExtensionName = "arrow.uuid";
3743
constexpr int32_t kUnknownFieldId = -1;
3844

45+
// Constants for schema json serialization
46+
constexpr std::string_view kSchemaId = "schema-id";
47+
constexpr std::string_view kIdentifierFieldIds = "identifier-field-ids";
48+
constexpr std::string_view kType = "type";
49+
constexpr std::string_view kStruct = "struct";
50+
constexpr std::string_view kList = "list";
51+
constexpr std::string_view kMap = "map";
52+
constexpr std::string_view kFields = "fields";
53+
constexpr std::string_view kElement = "element";
54+
constexpr std::string_view kKey = "key";
55+
constexpr std::string_view kValue = "value";
56+
constexpr std::string_view kDoc = "doc";
57+
constexpr std::string_view kName = "name";
58+
constexpr std::string_view kId = "id";
59+
constexpr std::string_view kInitialDefault = "initial-default";
60+
constexpr std::string_view kWriteDefault = "write-default";
61+
constexpr std::string_view kElementId = "element-id";
62+
constexpr std::string_view kKeyId = "key-id";
63+
constexpr std::string_view kValueId = "value-id";
64+
constexpr std::string_view kRequired = "required";
65+
constexpr std::string_view kElementRequired = "element-required";
66+
constexpr std::string_view kValueRequired = "value-required";
67+
3968
// Convert an Iceberg type to Arrow schema. Return value is Nanoarrow error code.
4069
ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view name,
4170
std::optional<int32_t> field_id, ArrowSchema* schema) {
@@ -328,6 +357,15 @@ expected<std::shared_ptr<Type>, Error> FromArrowSchema(const ArrowSchema& schema
328357
}
329358
}
330359

360+
std::unique_ptr<Schema> FromStructType(StructType&& struct_type, int32_t schema_id) {
361+
std::vector<SchemaField> fields;
362+
fields.reserve(struct_type.fields().size());
363+
for (auto& field : struct_type.fields()) {
364+
fields.emplace_back(std::move(field));
365+
}
366+
return std::make_unique<Schema>(schema_id, std::move(fields));
367+
}
368+
331369
} // namespace
332370

333371
expected<std::unique_ptr<Schema>, Error> FromArrowSchema(const ArrowSchema& schema,
@@ -344,13 +382,263 @@ expected<std::unique_ptr<Schema>, Error> FromArrowSchema(const ArrowSchema& sche
344382
.message = "Arrow schema must be a struct type for Iceberg schema"}};
345383
}
346384

347-
auto* struct_type = static_cast<StructType*>(type.get());
385+
auto& struct_type = static_cast<StructType&>(*type);
386+
return FromStructType(std::move(struct_type), schema_id);
387+
}
388+
389+
nlohmann::json FieldToJson(const SchemaField& field) {
390+
nlohmann::json json;
391+
json[kId] = field.field_id();
392+
json[kName] = field.name();
393+
json[kRequired] = !field.optional();
394+
json[kType] = TypeToJson(*field.type());
395+
return json;
396+
}
397+
398+
nlohmann::json TypeToJson(const Type& type) {
399+
switch (type.type_id()) {
400+
case TypeId::kStruct: {
401+
const auto& struct_type = static_cast<const StructType&>(type);
402+
nlohmann::json json;
403+
json[kType] = kStruct;
404+
nlohmann::json fields_json = nlohmann::json::array();
405+
for (const auto& field : struct_type.fields()) {
406+
fields_json.push_back(FieldToJson(field));
407+
// TODO(gangwu): add default values
408+
}
409+
json[kFields] = fields_json;
410+
return json;
411+
}
412+
case TypeId::kList: {
413+
const auto& list_type = static_cast<const ListType&>(type);
414+
nlohmann::json json;
415+
json[kType] = kList;
416+
417+
const auto& element_field = list_type.fields().front();
418+
json[kElementId] = element_field.field_id();
419+
json[kElementRequired] = !element_field.optional();
420+
json[kElement] = TypeToJson(*element_field.type());
421+
return json;
422+
}
423+
case TypeId::kMap: {
424+
const auto& map_type = static_cast<const MapType&>(type);
425+
nlohmann::json json;
426+
json[std::string(kType)] = kMap;
427+
428+
const auto& key_field = map_type.key();
429+
json[kKeyId] = key_field.field_id();
430+
json[kKey] = TypeToJson(*key_field.type());
431+
432+
const auto& value_field = map_type.value();
433+
json[kValueId] = value_field.field_id();
434+
json[kValueRequired] = !value_field.optional();
435+
json[kValue] = TypeToJson(*value_field.type());
436+
return json;
437+
}
438+
case TypeId::kBoolean:
439+
return "boolean";
440+
case TypeId::kInt:
441+
return "int";
442+
case TypeId::kLong:
443+
return "long";
444+
case TypeId::kFloat:
445+
return "float";
446+
case TypeId::kDouble:
447+
return "double";
448+
case TypeId::kDecimal: {
449+
const auto& decimal_type = static_cast<const DecimalType&>(type);
450+
return std::format("decimal({},{})", decimal_type.precision(),
451+
decimal_type.scale());
452+
}
453+
case TypeId::kDate:
454+
return "date";
455+
case TypeId::kTime:
456+
return "time";
457+
case TypeId::kTimestamp:
458+
return "timestamp";
459+
case TypeId::kTimestampTz:
460+
return "timestamptz";
461+
case TypeId::kString:
462+
return "string";
463+
case TypeId::kBinary:
464+
return "binary";
465+
case TypeId::kFixed: {
466+
const auto& fixed_type = static_cast<const FixedType&>(type);
467+
return std::format("fixed[{}]", fixed_type.length());
468+
}
469+
case TypeId::kUuid:
470+
return "uuid";
471+
}
472+
}
473+
474+
nlohmann::json SchemaToJson(const Schema& schema) {
475+
nlohmann::json json = TypeToJson(static_cast<const Type&>(schema));
476+
json[kSchemaId] = schema.schema_id();
477+
// TODO(gangwu): add identifier-field-ids.
478+
return json;
479+
}
480+
481+
namespace {
482+
483+
#define ICEBERG_CHECK_JSON_FIELD(field_name, json) \
484+
if (!json.contains(field_name)) [[unlikely]] { \
485+
return unexpected<Error>({ \
486+
.kind = ErrorKind::kJsonParseError, \
487+
.message = std::format("Missing '{}' in {}", field_name, json.dump()), \
488+
}); \
489+
}
490+
491+
expected<std::unique_ptr<Type>, Error> StructTypeFromJson(const nlohmann::json& json) {
492+
ICEBERG_CHECK_JSON_FIELD(kFields, json);
493+
348494
std::vector<SchemaField> fields;
349-
fields.reserve(struct_type->fields().size());
350-
for (auto& field : struct_type->fields()) {
351-
fields.emplace_back(std::move(field));
495+
for (const auto& field_json : json[kFields]) {
496+
ICEBERG_ASSIGN_OR_RAISE(auto field, FieldFromJson(field_json));
497+
fields.emplace_back(std::move(*field));
352498
}
353-
return std::make_unique<Schema>(schema_id, std::move(fields));
499+
500+
return std::make_unique<StructType>(std::move(fields));
501+
}
502+
503+
expected<std::unique_ptr<Type>, Error> ListTypeFromJson(const nlohmann::json& json) {
504+
ICEBERG_CHECK_JSON_FIELD(kElement, json);
505+
ICEBERG_CHECK_JSON_FIELD(kElementId, json);
506+
ICEBERG_CHECK_JSON_FIELD(kElementRequired, json);
507+
508+
ICEBERG_ASSIGN_OR_RAISE(auto element_type, TypeFromJson(json[kElement]));
509+
int32_t element_id = json[kElementId].get<int32_t>();
510+
bool element_required = json[kElementRequired].get<bool>();
511+
512+
return std::make_unique<ListType>(
513+
SchemaField(element_id, std::string(ListType::kElementName),
514+
std::move(element_type), !element_required));
515+
}
516+
517+
expected<std::unique_ptr<Type>, Error> MapTypeFromJson(const nlohmann::json& json) {
518+
ICEBERG_CHECK_JSON_FIELD(kKey, json);
519+
ICEBERG_CHECK_JSON_FIELD(kValue, json);
520+
ICEBERG_CHECK_JSON_FIELD(kKeyId, json);
521+
ICEBERG_CHECK_JSON_FIELD(kValueId, json);
522+
ICEBERG_CHECK_JSON_FIELD(kValueRequired, json);
523+
524+
ICEBERG_ASSIGN_OR_RAISE(auto key_type, TypeFromJson(json[kKey]));
525+
ICEBERG_ASSIGN_OR_RAISE(auto value_type, TypeFromJson(json[kValue]));
526+
int32_t key_id = json[kKeyId].get<int32_t>();
527+
int32_t value_id = json[kValueId].get<int32_t>();
528+
bool value_required = json[kValueRequired].get<bool>();
529+
530+
SchemaField key_field(key_id, std::string(MapType::kKeyName), std::move(key_type),
531+
/*optional=*/false);
532+
SchemaField value_field(value_id, std::string(MapType::kValueName),
533+
std::move(value_type), !value_required);
534+
return std::make_unique<MapType>(std::move(key_field), std::move(value_field));
535+
}
536+
537+
} // namespace
538+
539+
expected<std::unique_ptr<Type>, Error> TypeFromJson(const nlohmann::json& json) {
540+
if (json.is_string()) {
541+
std::string type_str = json.get<std::string>();
542+
if (type_str == "boolean") {
543+
return std::make_unique<BooleanType>();
544+
} else if (type_str == "int") {
545+
return std::make_unique<IntType>();
546+
} else if (type_str == "long") {
547+
return std::make_unique<LongType>();
548+
} else if (type_str == "float") {
549+
return std::make_unique<FloatType>();
550+
} else if (type_str == "double") {
551+
return std::make_unique<DoubleType>();
552+
} else if (type_str == "date") {
553+
return std::make_unique<DateType>();
554+
} else if (type_str == "time") {
555+
return std::make_unique<TimeType>();
556+
} else if (type_str == "timestamp") {
557+
return std::make_unique<TimestampType>();
558+
} else if (type_str == "timestamptz") {
559+
return std::make_unique<TimestampTzType>();
560+
} else if (type_str == "string") {
561+
return std::make_unique<StringType>();
562+
} else if (type_str == "binary") {
563+
return std::make_unique<BinaryType>();
564+
} else if (type_str == "uuid") {
565+
return std::make_unique<UuidType>();
566+
} else if (type_str.starts_with("fixed")) {
567+
std::regex fixed_regex(R"(fixed\[\s*(\d+)\s*\])");
568+
std::smatch match;
569+
if (std::regex_match(type_str, match, fixed_regex)) {
570+
return std::make_unique<FixedType>(std::stoi(match[1].str()));
571+
}
572+
return unexpected<Error>({
573+
.kind = ErrorKind::kJsonParseError,
574+
.message = std::format("Invalid fixed type: {}", type_str),
575+
});
576+
} else if (type_str.starts_with("decimal")) {
577+
std::regex decimal_regex(R"(decimal\(\s*(\d+)\s*,\s*(\d+)\s*\))");
578+
std::smatch match;
579+
if (std::regex_match(type_str, match, decimal_regex)) {
580+
return std::make_unique<DecimalType>(std::stoi(match[1].str()),
581+
std::stoi(match[2].str()));
582+
}
583+
return unexpected<Error>({
584+
.kind = ErrorKind::kJsonParseError,
585+
.message = std::format("Invalid decimal type: {}", type_str),
586+
});
587+
} else {
588+
return unexpected<Error>({
589+
.kind = ErrorKind::kJsonParseError,
590+
.message = std::format("Unknown primitive type: {}", type_str),
591+
});
592+
}
593+
}
594+
595+
// For complex types like struct, list, and map
596+
ICEBERG_CHECK_JSON_FIELD(kType, json);
597+
std::string type_str = json[kType].get<std::string>();
598+
if (type_str == kStruct) {
599+
return StructTypeFromJson(json);
600+
} else if (type_str == kList) {
601+
return ListTypeFromJson(json);
602+
} else if (type_str == kMap) {
603+
return MapTypeFromJson(json);
604+
} else {
605+
return unexpected<Error>({
606+
.kind = ErrorKind::kJsonParseError,
607+
.message = std::format("Unknown complex type: {}", type_str),
608+
});
609+
}
610+
}
611+
612+
expected<std::unique_ptr<SchemaField>, Error> FieldFromJson(const nlohmann::json& json) {
613+
ICEBERG_CHECK_JSON_FIELD(kId, json);
614+
ICEBERG_CHECK_JSON_FIELD(kName, json);
615+
ICEBERG_CHECK_JSON_FIELD(kType, json);
616+
ICEBERG_CHECK_JSON_FIELD(kRequired, json);
617+
618+
ICEBERG_ASSIGN_OR_RAISE(auto type, TypeFromJson(json[kType]));
619+
int32_t field_id = json[kId].get<int32_t>();
620+
std::string name = json[kName].get<std::string>();
621+
bool required = json[kRequired].get<bool>();
622+
623+
return std::make_unique<SchemaField>(field_id, std::move(name), std::move(type),
624+
!required);
625+
}
626+
627+
expected<std::unique_ptr<Schema>, Error> SchemaFromJson(const nlohmann::json& json) {
628+
ICEBERG_CHECK_JSON_FIELD(kType, json);
629+
ICEBERG_CHECK_JSON_FIELD(kSchemaId, json);
630+
631+
ICEBERG_ASSIGN_OR_RAISE(auto type, TypeFromJson(json));
632+
if (type->type_id() != TypeId::kStruct) [[unlikely]] {
633+
return unexpected<Error>({
634+
.kind = ErrorKind::kJsonParseError,
635+
.message = std::format("Schema must be a struct type, but got {}", json.dump()),
636+
});
637+
}
638+
639+
int32_t schema_id = json[kSchemaId].get<int32_t>();
640+
auto& struct_type = static_cast<StructType&>(*type);
641+
return FromStructType(std::move(struct_type), schema_id);
354642
}
355643

356644
} // namespace iceberg

src/iceberg/schema_internal.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <memory>
2323

2424
#include <nanoarrow/nanoarrow.h>
25+
#include <nlohmann/json_fwd.hpp>
2526

2627
#include "iceberg/error.h"
2728
#include "iceberg/expected.h"
@@ -49,4 +50,40 @@ expected<void, Error> ToArrowSchema(const Schema& schema, ArrowSchema* out);
4950
expected<std::unique_ptr<Schema>, Error> FromArrowSchema(const ArrowSchema& schema,
5051
int32_t schema_id);
5152

53+
/// \brief Convert an Iceberg Schema to JSON.
54+
///
55+
/// \param[in] schema The Iceberg schema to convert.
56+
/// \return The JSON representation of the schema.
57+
nlohmann::json SchemaToJson(const Schema& schema);
58+
59+
/// \brief Convert an Iceberg Type to JSON.
60+
///
61+
/// \param[in] type The Iceberg type to convert.
62+
/// \return The JSON representation of the type.
63+
nlohmann::json TypeToJson(const Type& type);
64+
65+
/// \brief Convert an Iceberg SchemaField to JSON.
66+
///
67+
/// \param[in] field The Iceberg field to convert.
68+
/// \return The JSON representation of the field.
69+
nlohmann::json FieldToJson(const SchemaField& field);
70+
71+
/// \brief Convert JSON to an Iceberg Schema.
72+
///
73+
/// \param[in] json The JSON representation of the schema.
74+
/// \return The Iceberg schema or an error if the conversion fails.
75+
expected<std::unique_ptr<Schema>, Error> SchemaFromJson(const nlohmann::json& json);
76+
77+
/// \brief Convert JSON to an Iceberg Type.
78+
///
79+
/// \param[in] json The JSON representation of the type.
80+
/// \return The Iceberg type or an error if the conversion fails.
81+
expected<std::unique_ptr<Type>, Error> TypeFromJson(const nlohmann::json& json);
82+
83+
/// \brief Convert JSON to an Iceberg SchemaField.
84+
///
85+
/// \param[in] json The JSON representation of the field.
86+
/// \return The Iceberg field or an error if the conversion fails.
87+
expected<std::unique_ptr<SchemaField>, Error> FieldFromJson(const nlohmann::json& json);
88+
5289
} // namespace iceberg

0 commit comments

Comments
 (0)