Skip to content

Commit 7fc341c

Browse files
committed
deduce metadata location based on table location
1 parent 184963d commit 7fc341c

File tree

3 files changed

+83
-4
lines changed

3 files changed

+83
-4
lines changed

src/Databases/DataLake/GlueCatalog.cpp

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -317,11 +317,22 @@ bool GlueCatalog::tryGetTableMetadata(
317317
{
318318
result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = table_params.at("metadata_location")});
319319
}
320+
else if (const auto & location = table_outcome.GetStorageDescriptor().GetLocation(); !location.empty())
321+
{
322+
std::string location_with_slash = location;
323+
if (!location_with_slash.ends_with('/'))
324+
location_with_slash += '/';
325+
326+
// Resolve the actual metadata file path based on table location
327+
std::string resolved_metadata_path = resolveMetadataPathFromTableLocation(location_with_slash, result);
328+
result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = resolved_metadata_path});
329+
}
330+
320331
else
321332
{
322-
result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \
323-
"It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters",
324-
database_name + "." + table_name));
333+
result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \
334+
"It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters",
335+
database_name + "." + table_name));
325336
}
326337
};
327338

@@ -424,7 +435,7 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet
424435
if (metadata_path.starts_with("s3:/"))
425436
metadata_path = metadata_path.substr(5);
426437

427-
// Delete bucket
438+
// Delete bucket from path
428439
std::size_t pos = metadata_path.find('/');
429440
if (pos != std::string::npos)
430441
metadata_path = metadata_path.substr(pos + 1);
@@ -491,6 +502,69 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet
491502
return false;
492503
}
493504

505+
/// Deduces the location of the current Iceberg metadata file for a table that has no
/// `metadata_location` parameter, using only the table's storage location.
///
/// The method builds an S3 Iceberg storage configuration from the catalog's engine definition
/// and settings, reads `<table_location>metadata/version-hint.text` through it, and turns the
/// hint into a metadata file path.
///
/// @param table_location  Table root location. Must end with '/', because every path below is
///                        built by plain concatenation.
/// @param table_metadata  Source of optional storage credentials that are appended to the engine
///                        arguments when only the endpoint argument is present.
/// @return Full path (still prefixed with `table_location`) of the metadata file:
///         - `metadata/v<N>.metadata.json` when the hint contains a number `N`;
///         - `metadata/<hint>` when the hint already is a `*.metadata.json` file name;
///         - `metadata/metadata.json` when the hint is missing, unreadable or has unexpected content.
///
/// Errors raised while building the storage configuration or creating the object storage are
/// propagated to the caller; only failures while reading the hint are swallowed (best effort).
String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const
{
    const String metadata_dir = table_location + "metadata/";
    const String version_hint_path = metadata_dir + "version-hint.text";
    // NOTE(review): `metadata.json` is not a name Iceberg writers are known to produce; it is kept
    // as the last-resort fallback to preserve the existing contract — confirm it is intended.
    const String fallback_metadata_path = metadata_dir + "metadata.json";

    // Start from the engine arguments of the catalog definition when they exist. A missing
    // definition/engine/argument list is treated as "no arguments" instead of being dereferenced.
    DB::ASTs args;
    if (table_engine_definition)
    {
        const auto * storage = table_engine_definition->as<DB::ASTStorage>();
        if (storage && storage->engine && storage->engine->arguments)
            args = storage->engine->arguments->children;
    }

    // The first engine argument is always the endpoint: the explicitly configured one if set,
    // otherwise the table location itself.
    const String storage_endpoint = !settings.storage_endpoint.empty() ? settings.storage_endpoint : table_location;
    if (args.empty())
        args.emplace_back(std::make_shared<DB::ASTLiteral>(storage_endpoint));
    else
        args[0] = std::make_shared<DB::ASTLiteral>(storage_endpoint);

    // Credentials are only added when the argument list holds nothing but the endpoint.
    if (args.size() == 1 && table_metadata.hasStorageCredentials())
    {
        auto storage_credentials = table_metadata.getStorageCredentials();
        if (storage_credentials)
            storage_credentials->addCredentialsToEngineArgs(args);
    }

    auto storage_settings = std::make_shared<DB::DataLakeStorageSettings>();
    storage_settings->loadFromSettingsChanges(settings.allChanged());
    auto configuration = std::make_shared<DB::StorageS3IcebergConfiguration>(storage_settings);
    DB::StorageObjectStorageConfiguration::initialize(*configuration, args, getContext(), false);

    auto object_storage = configuration->createObjectStorage(getContext(), true);
    const auto & read_settings = getContext()->getReadSettings();

    try
    {
        // The object storage addresses objects by key, so drop the "s3://" scheme and the bucket.
        String version_hint_object_path = version_hint_path;
        if (version_hint_object_path.starts_with("s3://"))
        {
            version_hint_object_path = version_hint_object_path.substr(5);
            if (const auto bucket_end = version_hint_object_path.find('/'); bucket_end != std::string::npos)
                version_hint_object_path = version_hint_object_path.substr(bucket_end + 1);
        }

        DB::StoredObject version_hint_stored_object(version_hint_object_path);
        auto version_hint_buf = object_storage->readObject(version_hint_stored_object, read_settings);
        String version_str;
        readString(version_str, *version_hint_buf);
        boost::algorithm::trim(version_str);

        LOG_TRACE(log, "Read version {} from version-hint.text for table location '{}'", version_str, table_location);

        // NOTE(review): presumably some writers store a complete metadata file name in the hint;
        // such a value is used verbatim instead of being wrapped into "v<...>.metadata.json".
        if (version_str.ends_with(".metadata.json"))
            return metadata_dir + version_str;

        // Iceberg file-system tables name metadata files "v<N>.metadata.json".
        if (!version_str.empty() && version_str.find_first_not_of("0123456789") == std::string::npos)
            return metadata_dir + "v" + version_str + ".metadata.json";

        // Empty or otherwise unexpected hint content cannot be turned into a valid file name.
        LOG_TRACE(log, "Unexpected content '{}' in '{}', falling back to metadata.json", version_str, version_hint_path);
        return fallback_metadata_path;
    }
    catch (...)
    {
        // Deliberate best effort: a missing or unreadable version-hint.text is not an error here.
        LOG_TRACE(log, "Could not read version-hint.text from '{}', falling back to metadata.json", version_hint_path);
        return fallback_metadata_path;
    }
}
567+
494568
void GlueCatalog::createNamespaceIfNotExists(const String & namespace_name) const
495569
{
496570
Aws::Glue::Model::CreateDatabaseRequest create_request;

src/Databases/DataLake/GlueCatalog.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ class GlueCatalog final : public ICatalog, private DB::WithContext
8181
/// This method allows to clarify the actual type of the timestamp column.
8282
bool classifyTimestampTZ(const String & column_name, const TableMetadata & table_metadata) const;
8383

84+
String resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const;
85+
8486
mutable DB::CacheBase<String, Poco::JSON::Object::Ptr> metadata_objects;
8587
};
8688

src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ If enabled, indicates that metadata is taken from iceberg specification that is
5151
)", 0) \
5252
DECLARE(String, iceberg_metadata_file_path, "", R"(
5353
Explicit path to desired Iceberg metadata file, should be relative to path in object storage. Make sense for table function use case only.
54+
)", 0) \
55+
DECLARE(String, iceberg_table_location, "", R"(
56+
Explicit path to Iceberg table location (warehouse). If no iceberg_metadata_file_path provided, it will be deduced using this parameter.
5457
)", 0) \
5558
DECLARE(String, iceberg_metadata_table_uuid, "", R"(
5659
Explicit table UUID to read metadata for. Ignored if iceberg_metadata_file_path is set.

0 commit comments

Comments
 (0)