Skip to content

Commit 9a7b6b5

Browse files
committed
Cut bucket from path
1 parent 6eceb35 commit 9a7b6b5

File tree

5 files changed

+26
-5
lines changed

5 files changed

+26
-5
lines changed

src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ void IcebergMetadata::updateSnapshot()
487487

488488
relevant_snapshot = IcebergSnapshot{
489489
getManifestList(getProperFilePathFromMetadataInfo(
490-
snapshot->getValue<String>(MANIFEST_LIST_PATH_FIELD), configuration_ptr->getPath(), table_location)),
490+
snapshot->getValue<String>(MANIFEST_LIST_PATH_FIELD), configuration_ptr->getPath(), table_location, configuration_ptr->getNamespace())),
491491
relevant_snapshot_id, total_rows, total_bytes};
492492

493493
if (!snapshot->has("schema-id"))
@@ -654,7 +654,7 @@ ManifestListPtr IcebergMetadata::getManifestList(const String & filename) const
654654
for (size_t i = 0; i < manifest_list_deserializer.rows(); ++i)
655655
{
656656
const std::string file_path = manifest_list_deserializer.getValueFromRowByName(i, MANIFEST_FILE_PATH_COLUMN, TypeIndex::String).safeGet<std::string>();
657-
const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPath(), table_location);
657+
const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPath(), table_location, configuration_ptr->getNamespace());
658658
Int64 added_sequence_number = 0;
659659
if (format_version > 1)
660660
added_sequence_number = manifest_list_deserializer.getValueFromRowByName(i, SEQUENCE_NUMBER_COLUMN, TypeIndex::Int64).safeGet<Int64>();
@@ -706,6 +706,7 @@ ManifestFilePtr IcebergMetadata::getManifestFile(const String & filename, Int64
706706
schema_processor,
707707
inherited_sequence_number,
708708
table_location,
709+
configuration_ptr->getNamespace(),
709710
context);
710711
};
711712

src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ ManifestFileContent::ManifestFileContent(
141141
const IcebergSchemaProcessor & schema_processor,
142142
Int64 inherited_sequence_number,
143143
const String & table_location,
144+
const String & common_namespace,
144145
DB::ContextPtr context)
145146
{
146147
this->schema_id = schema_id_;
@@ -205,7 +206,11 @@ ManifestFileContent::ManifestFileContent(
205206
}
206207
const auto status = ManifestEntryStatus(manifest_file_deserializer.getValueFromRowByName(i, COLUMN_STATUS_NAME, TypeIndex::Int32).safeGet<UInt64>());
207208

208-
const auto file_path = getProperFilePathFromMetadataInfo(manifest_file_deserializer.getValueFromRowByName(i, SUBCOLUMN_FILE_PATH_NAME, TypeIndex::String).safeGet<String>(), common_path, table_location);
209+
const auto file_path = getProperFilePathFromMetadataInfo(
210+
manifest_file_deserializer.getValueFromRowByName(i, SUBCOLUMN_FILE_PATH_NAME, TypeIndex::String).safeGet<String>(),
211+
common_path,
212+
table_location,
213+
common_namespace);
209214

210215
/// NOTE: This is weird, because in manifest file partition looks like this:
211216
/// {

src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ class ManifestFileContent
9696
const DB::IcebergSchemaProcessor & schema_processor,
9797
Int64 inherited_sequence_number,
9898
const std::string & table_location,
99+
const std::string & common_namespace,
99100
DB::ContextPtr context);
100101

101102
const std::vector<ManifestFileEntry> & getFiles() const;

src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ using namespace DB;
2828
// This function is used to get the file path inside the directory which corresponds to iceberg table from the full blob path which is written in manifest and metadata files.
2929
// For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro
3030
// Common path should end with "<table_name>" or "<table_name>/".
31-
std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location)
31+
std::string getProperFilePathFromMetadataInfo(
32+
std::string_view data_path,
33+
std::string_view common_path,
34+
std::string_view table_location,
35+
std::string_view common_namespace)
3236
{
3337
auto trim_backward_slash = [](std::string_view str) -> std::string_view
3438
{
@@ -91,6 +95,12 @@ std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::s
9195
pos = data_path.find("/", pos + 3);
9296
if (pos == std::string::npos)
9397
throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unexpected data path: '{}'", data_path);
98+
if (data_path.substr(pos + 1).starts_with(common_namespace))
99+
{
100+
auto new_pos = data_path.find("/", pos + 1);
101+
if (new_pos - pos == common_namespace.length() + 1) /// bucket in the path
102+
pos = new_pos;
103+
}
94104
return std::string(data_path.substr(pos));
95105
}
96106
}

src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010
namespace Iceberg
1111
{
1212

13-
std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location);
13+
std::string getProperFilePathFromMetadataInfo(
14+
std::string_view data_path,
15+
std::string_view common_path,
16+
std::string_view table_location,
17+
std::string_view common_namespace);
1418

1519
}
1620

0 commit comments

Comments
 (0)