2222#include < cstring>
2323#include < format>
2424#include < optional>
25+ #include < regex>
2526#include < string>
2627
28+ #include < nlohmann/json.hpp>
29+
2730#include " iceberg/expected.h"
2831#include " iceberg/schema.h"
32+ #include " iceberg/type.h"
33+ #include " iceberg/util/macros.h"
2934
3035namespace iceberg {
3136
3237namespace {
3338
39+ // Constants for Arrow schema metadata
3440constexpr const char * kArrowExtensionName = " ARROW:extension:name" ;
3541constexpr const char * kArrowExtensionMetadata = " ARROW:extension:metadata" ;
3642constexpr const char * kArrowUuidExtensionName = " arrow.uuid" ;
3743constexpr int32_t kUnknownFieldId = -1 ;
3844
45+ // Constants for schema json serialization
46+ constexpr std::string_view kSchemaId = " schema-id" ;
47+ constexpr std::string_view kIdentifierFieldIds = " identifier-field-ids" ;
48+ constexpr std::string_view kType = " type" ;
49+ constexpr std::string_view kStruct = " struct" ;
50+ constexpr std::string_view kList = " list" ;
51+ constexpr std::string_view kMap = " map" ;
52+ constexpr std::string_view kFields = " fields" ;
53+ constexpr std::string_view kElement = " element" ;
54+ constexpr std::string_view kKey = " key" ;
55+ constexpr std::string_view kValue = " value" ;
56+ constexpr std::string_view kDoc = " doc" ;
57+ constexpr std::string_view kName = " name" ;
58+ constexpr std::string_view kId = " id" ;
59+ constexpr std::string_view kInitialDefault = " initial-default" ;
60+ constexpr std::string_view kWriteDefault = " write-default" ;
61+ constexpr std::string_view kElementId = " element-id" ;
62+ constexpr std::string_view kKeyId = " key-id" ;
63+ constexpr std::string_view kValueId = " value-id" ;
64+ constexpr std::string_view kRequired = " required" ;
65+ constexpr std::string_view kElementRequired = " element-required" ;
66+ constexpr std::string_view kValueRequired = " value-required" ;
67+
3968// Convert an Iceberg type to Arrow schema. Return value is Nanoarrow error code.
4069ArrowErrorCode ToArrowSchema (const Type& type, bool optional, std::string_view name,
4170 std::optional<int32_t > field_id, ArrowSchema* schema) {
@@ -328,6 +357,15 @@ expected<std::shared_ptr<Type>, Error> FromArrowSchema(const ArrowSchema& schema
328357 }
329358}
330359
360+ std::unique_ptr<Schema> FromStructType (StructType&& struct_type, int32_t schema_id) {
361+ std::vector<SchemaField> fields;
362+ fields.reserve (struct_type.fields ().size ());
363+ for (auto & field : struct_type.fields ()) {
364+ fields.emplace_back (std::move (field));
365+ }
366+ return std::make_unique<Schema>(schema_id, std::move (fields));
367+ }
368+
331369} // namespace
332370
333371expected<std::unique_ptr<Schema>, Error> FromArrowSchema (const ArrowSchema& schema,
@@ -344,13 +382,263 @@ expected<std::unique_ptr<Schema>, Error> FromArrowSchema(const ArrowSchema& sche
344382 .message = " Arrow schema must be a struct type for Iceberg schema" }};
345383 }
346384
347- auto * struct_type = static_cast <StructType*>(type.get ());
385+ auto & struct_type = static_cast <StructType&>(*type);
386+ return FromStructType (std::move (struct_type), schema_id);
387+ }
388+
389+ nlohmann::json FieldToJson (const SchemaField& field) {
390+ nlohmann::json json;
391+ json[kId ] = field.field_id ();
392+ json[kName ] = field.name ();
393+ json[kRequired ] = !field.optional ();
394+ json[kType ] = TypeToJson (*field.type ());
395+ return json;
396+ }
397+
398+ nlohmann::json TypeToJson (const Type& type) {
399+ switch (type.type_id ()) {
400+ case TypeId::kStruct : {
401+ const auto & struct_type = static_cast <const StructType&>(type);
402+ nlohmann::json json;
403+ json[kType ] = kStruct ;
404+ nlohmann::json fields_json = nlohmann::json::array ();
405+ for (const auto & field : struct_type.fields ()) {
406+ fields_json.push_back (FieldToJson (field));
407+ // TODO(gangwu): add default values
408+ }
409+ json[kFields ] = fields_json;
410+ return json;
411+ }
412+ case TypeId::kList : {
413+ const auto & list_type = static_cast <const ListType&>(type);
414+ nlohmann::json json;
415+ json[kType ] = kList ;
416+
417+ const auto & element_field = list_type.fields ().front ();
418+ json[kElementId ] = element_field.field_id ();
419+ json[kElementRequired ] = !element_field.optional ();
420+ json[kElement ] = TypeToJson (*element_field.type ());
421+ return json;
422+ }
423+ case TypeId::kMap : {
424+ const auto & map_type = static_cast <const MapType&>(type);
425+ nlohmann::json json;
426+ json[std::string (kType )] = kMap ;
427+
428+ const auto & key_field = map_type.key ();
429+ json[kKeyId ] = key_field.field_id ();
430+ json[kKey ] = TypeToJson (*key_field.type ());
431+
432+ const auto & value_field = map_type.value ();
433+ json[kValueId ] = value_field.field_id ();
434+ json[kValueRequired ] = !value_field.optional ();
435+ json[kValue ] = TypeToJson (*value_field.type ());
436+ return json;
437+ }
438+ case TypeId::kBoolean :
439+ return " boolean" ;
440+ case TypeId::kInt :
441+ return " int" ;
442+ case TypeId::kLong :
443+ return " long" ;
444+ case TypeId::kFloat :
445+ return " float" ;
446+ case TypeId::kDouble :
447+ return " double" ;
448+ case TypeId::kDecimal : {
449+ const auto & decimal_type = static_cast <const DecimalType&>(type);
450+ return std::format (" decimal({},{})" , decimal_type.precision (),
451+ decimal_type.scale ());
452+ }
453+ case TypeId::kDate :
454+ return " date" ;
455+ case TypeId::kTime :
456+ return " time" ;
457+ case TypeId::kTimestamp :
458+ return " timestamp" ;
459+ case TypeId::kTimestampTz :
460+ return " timestamptz" ;
461+ case TypeId::kString :
462+ return " string" ;
463+ case TypeId::kBinary :
464+ return " binary" ;
465+ case TypeId::kFixed : {
466+ const auto & fixed_type = static_cast <const FixedType&>(type);
467+ return std::format (" fixed[{}]" , fixed_type.length ());
468+ }
469+ case TypeId::kUuid :
470+ return " uuid" ;
471+ }
472+ }
473+
474+ nlohmann::json SchemaToJson (const Schema& schema) {
475+ nlohmann::json json = TypeToJson (static_cast <const Type&>(schema));
476+ json[kSchemaId ] = schema.schema_id ();
477+ // TODO(gangwu): add identifier-field-ids.
478+ return json;
479+ }
480+
481+ namespace {
482+
483+ #define ICEBERG_CHECK_JSON_FIELD (field_name, json ) \
484+ if (!json.contains(field_name)) [[unlikely]] { \
485+ return unexpected<Error>({ \
486+ .kind = ErrorKind::kJsonParseError , \
487+ .message = std::format (" Missing '{}' in {}" , field_name, json.dump ()), \
488+ }); \
489+ }
490+
491+ expected<std::unique_ptr<Type>, Error> StructTypeFromJson (const nlohmann::json& json) {
492+ ICEBERG_CHECK_JSON_FIELD (kFields , json);
493+
348494 std::vector<SchemaField> fields;
349- fields. reserve (struct_type-> fields (). size ());
350- for (auto & field : struct_type-> fields ()) {
351- fields.emplace_back (std::move (field));
495+ for ( const auto & field_json : json[ kFields ]) {
496+ ICEBERG_ASSIGN_OR_RAISE (auto field, FieldFromJson (field_json));
497+ fields.emplace_back (std::move (* field));
352498 }
353- return std::make_unique<Schema>(schema_id, std::move (fields));
499+
500+ return std::make_unique<StructType>(std::move (fields));
501+ }
502+
503+ expected<std::unique_ptr<Type>, Error> ListTypeFromJson (const nlohmann::json& json) {
504+ ICEBERG_CHECK_JSON_FIELD (kElement , json);
505+ ICEBERG_CHECK_JSON_FIELD (kElementId , json);
506+ ICEBERG_CHECK_JSON_FIELD (kElementRequired , json);
507+
508+ ICEBERG_ASSIGN_OR_RAISE (auto element_type, TypeFromJson (json[kElement ]));
509+ int32_t element_id = json[kElementId ].get <int32_t >();
510+ bool element_required = json[kElementRequired ].get <bool >();
511+
512+ return std::make_unique<ListType>(
513+ SchemaField (element_id, std::string (ListType::kElementName ),
514+ std::move (element_type), !element_required));
515+ }
516+
517+ expected<std::unique_ptr<Type>, Error> MapTypeFromJson (const nlohmann::json& json) {
518+ ICEBERG_CHECK_JSON_FIELD (kKey , json);
519+ ICEBERG_CHECK_JSON_FIELD (kValue , json);
520+ ICEBERG_CHECK_JSON_FIELD (kKeyId , json);
521+ ICEBERG_CHECK_JSON_FIELD (kValueId , json);
522+ ICEBERG_CHECK_JSON_FIELD (kValueRequired , json);
523+
524+ ICEBERG_ASSIGN_OR_RAISE (auto key_type, TypeFromJson (json[kKey ]));
525+ ICEBERG_ASSIGN_OR_RAISE (auto value_type, TypeFromJson (json[kValue ]));
526+ int32_t key_id = json[kKeyId ].get <int32_t >();
527+ int32_t value_id = json[kValueId ].get <int32_t >();
528+ bool value_required = json[kValueRequired ].get <bool >();
529+
530+ SchemaField key_field (key_id, std::string (MapType::kKeyName ), std::move (key_type),
531+ /* optional=*/ false );
532+ SchemaField value_field (value_id, std::string (MapType::kValueName ),
533+ std::move (value_type), !value_required);
534+ return std::make_unique<MapType>(std::move (key_field), std::move (value_field));
535+ }
536+
537+ } // namespace
538+
539+ expected<std::unique_ptr<Type>, Error> TypeFromJson (const nlohmann::json& json) {
540+ if (json.is_string ()) {
541+ std::string type_str = json.get <std::string>();
542+ if (type_str == " boolean" ) {
543+ return std::make_unique<BooleanType>();
544+ } else if (type_str == " int" ) {
545+ return std::make_unique<IntType>();
546+ } else if (type_str == " long" ) {
547+ return std::make_unique<LongType>();
548+ } else if (type_str == " float" ) {
549+ return std::make_unique<FloatType>();
550+ } else if (type_str == " double" ) {
551+ return std::make_unique<DoubleType>();
552+ } else if (type_str == " date" ) {
553+ return std::make_unique<DateType>();
554+ } else if (type_str == " time" ) {
555+ return std::make_unique<TimeType>();
556+ } else if (type_str == " timestamp" ) {
557+ return std::make_unique<TimestampType>();
558+ } else if (type_str == " timestamptz" ) {
559+ return std::make_unique<TimestampTzType>();
560+ } else if (type_str == " string" ) {
561+ return std::make_unique<StringType>();
562+ } else if (type_str == " binary" ) {
563+ return std::make_unique<BinaryType>();
564+ } else if (type_str == " uuid" ) {
565+ return std::make_unique<UuidType>();
566+ } else if (type_str.starts_with (" fixed" )) {
567+ std::regex fixed_regex (R"( fixed\[\s*(\d+)\s*\])" );
568+ std::smatch match;
569+ if (std::regex_match (type_str, match, fixed_regex)) {
570+ return std::make_unique<FixedType>(std::stoi (match[1 ].str ()));
571+ }
572+ return unexpected<Error>({
573+ .kind = ErrorKind::kJsonParseError ,
574+ .message = std::format (" Invalid fixed type: {}" , type_str),
575+ });
576+ } else if (type_str.starts_with (" decimal" )) {
577+ std::regex decimal_regex (R"( decimal\(\s*(\d+)\s*,\s*(\d+)\s*\))" );
578+ std::smatch match;
579+ if (std::regex_match (type_str, match, decimal_regex)) {
580+ return std::make_unique<DecimalType>(std::stoi (match[1 ].str ()),
581+ std::stoi (match[2 ].str ()));
582+ }
583+ return unexpected<Error>({
584+ .kind = ErrorKind::kJsonParseError ,
585+ .message = std::format (" Invalid decimal type: {}" , type_str),
586+ });
587+ } else {
588+ return unexpected<Error>({
589+ .kind = ErrorKind::kJsonParseError ,
590+ .message = std::format (" Unknown primitive type: {}" , type_str),
591+ });
592+ }
593+ }
594+
595+ // For complex types like struct, list, and map
596+ ICEBERG_CHECK_JSON_FIELD (kType , json);
597+ std::string type_str = json[kType ].get <std::string>();
598+ if (type_str == kStruct ) {
599+ return StructTypeFromJson (json);
600+ } else if (type_str == kList ) {
601+ return ListTypeFromJson (json);
602+ } else if (type_str == kMap ) {
603+ return MapTypeFromJson (json);
604+ } else {
605+ return unexpected<Error>({
606+ .kind = ErrorKind::kJsonParseError ,
607+ .message = std::format (" Unknown complex type: {}" , type_str),
608+ });
609+ }
610+ }
611+
612+ expected<std::unique_ptr<SchemaField>, Error> FieldFromJson (const nlohmann::json& json) {
613+ ICEBERG_CHECK_JSON_FIELD (kId , json);
614+ ICEBERG_CHECK_JSON_FIELD (kName , json);
615+ ICEBERG_CHECK_JSON_FIELD (kType , json);
616+ ICEBERG_CHECK_JSON_FIELD (kRequired , json);
617+
618+ ICEBERG_ASSIGN_OR_RAISE (auto type, TypeFromJson (json[kType ]));
619+ int32_t field_id = json[kId ].get <int32_t >();
620+ std::string name = json[kName ].get <std::string>();
621+ bool required = json[kRequired ].get <bool >();
622+
623+ return std::make_unique<SchemaField>(field_id, std::move (name), std::move (type),
624+ !required);
625+ }
626+
627+ expected<std::unique_ptr<Schema>, Error> SchemaFromJson (const nlohmann::json& json) {
628+ ICEBERG_CHECK_JSON_FIELD (kType , json);
629+ ICEBERG_CHECK_JSON_FIELD (kSchemaId , json);
630+
631+ ICEBERG_ASSIGN_OR_RAISE (auto type, TypeFromJson (json));
632+ if (type->type_id () != TypeId::kStruct ) [[unlikely]] {
633+ return unexpected<Error>({
634+ .kind = ErrorKind::kJsonParseError ,
635+ .message = std::format (" Schema must be a struct type, but got {}" , json.dump ()),
636+ });
637+ }
638+
639+ int32_t schema_id = json[kSchemaId ].get <int32_t >();
640+ auto & struct_type = static_cast <StructType&>(*type);
641+ return FromStructType (std::move (struct_type), schema_id);
354642}
355643
356644} // namespace iceberg
0 commit comments