Skip to content

Commit fc9d275

Browse files
authored
Merge pull request #492 from Tishj/fix_relativization_script
[Dev] Rework the relativization script, to actually make the paths relative
2 parents b65179f + c030d3e commit fc9d275

File tree

2 files changed

+46
-37
lines changed

2 files changed

+46
-37
lines changed

scripts/relativize_iceberg_table.py

Lines changed: 36 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
import shutil
77
import tempfile
88
import subprocess
9+
import re
910
from datetime import datetime
10-
from pathlib import Path
11+
from pathlib import Path, PurePosixPath
1112

1213

1314
def parse_args():
@@ -22,10 +23,12 @@ def abort(msg):
2223

2324

2425
def relativize_path(abs_path: str, new_base: str) -> str:
25-
if "/data/" in abs_path:
26-
return os.path.join(new_base, "data", abs_path.split("/data/", 1)[1])
27-
elif "/metadata/" in abs_path:
28-
return os.path.join(new_base, "metadata", abs_path.split("/metadata/", 1)[1])
26+
p = PurePosixPath(abs_path)
27+
parent = p.parent.name
28+
if parent in ("data", "metadata"):
29+
return os.path.join(new_base, parent, p.name)
30+
elif p.parent.parent.name == "index" and parent in ("data", "metadata"):
31+
return os.path.join(new_base, parent, p.name)
2932
else:
3033
return new_base
3134

@@ -47,40 +50,40 @@ def validate_paths(table_path: Path):
4750
def jq_rewrite_json(file_path: Path, output_path: Path, new_base: str):
4851
jq_filter = f'''
4952
def relativize_path(path; new_base):
50-
if (path | test(".*/index/data/.*")) then
51-
(path | sub(".*?/index/"; "")) as $p | new_base + "/" + $p
52-
elif (path | test(".*/index/metadata/.*")) then
53-
(path | sub(".*?/index/"; "")) as $p | new_base + "/" + $p
53+
if (path | test(".*/index/data/[^/]+$")) then
54+
(path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
55+
elif (path | test(".*/index/metadata/[^/]+$")) then
56+
(path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
5457
elif (path | test(".*/index$")) then
5558
new_base
5659
else
57-
if (path | test(".*/data/.*")) then
58-
(path | sub(".*?/data/"; "")) as $p | new_base + "/data/" + $p
59-
elif (path | test(".*/metadata/.*")) then
60-
(path | sub(".*?/metadata/"; "")) as $p | new_base + "/metadata/" + $p
60+
if (path | test(".*/metadata/[^/]+$")) then
61+
(path | sub(".*/metadata/"; "")) as $p | new_base + "/metadata/" + $p
62+
elif (path | test(".*/data/[^/]+$")) then
63+
(path | sub(".*/data/"; "")) as $p | new_base + "/data/" + $p
6164
else
6265
new_base
6366
end
6467
end;
6568
66-
.location = relativize_path(.location; "{new_base}") |
69+
.location = relativize_path(.location; "") |
6770
if .snapshots then
6871
.snapshots |= map(
6972
if .["manifest-list"] then
70-
.["manifest-list"] = relativize_path(.["manifest-list"]; "{new_base}")
73+
.["manifest-list"] = relativize_path(.["manifest-list"]; "")
7174
else . end
7275
)
7376
else . end |
7477
if .["metadata-log"] then
7578
.["metadata-log"] |= map(
7679
if .["metadata-file"] then
77-
.["metadata-file"] = relativize_path(.["metadata-file"]; "{new_base}")
80+
.["metadata-file"] = relativize_path(.["metadata-file"]; "")
7881
else . end
7982
)
8083
else . end
8184
'''
8285
with open(output_path, 'w') as out:
83-
returncode = subprocess.run(["jq", jq_filter, str(file_path)], stdout=out, check=True)
86+
subprocess.run(["jq", jq_filter, str(file_path)], stdout=out, check=True)
8487

8588

8689
def process_json_files(metadata_dir: Path, temp_metadata_dir: Path, new_base: str):
@@ -108,17 +111,17 @@ def process_avro_file(avro_file: Path, temp_dir: Path, temp_metadata_dir: Path,
108111
if filename.startswith("snap-"):
109112
jq_filter = f'''
110113
def relativize_path(path; new_base):
111-
if (path | test(".*/index/data/.*")) then
112-
(path | sub(".*?/index/"; "")) as $p | new_base + "/" + $p
113-
elif (path | test(".*/index/metadata/.*")) then
114-
(path | sub(".*?/index/"; "")) as $p | new_base + "/" + $p
114+
if (path | test(".*/index/data/[^/]+$")) then
115+
(path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
116+
elif (path | test(".*/index/metadata/[^/]+$")) then
117+
(path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
115118
elif (path | test(".*/index$")) then
116119
new_base
117120
else
118-
if (path | test(".*/data/.*")) then
119-
(path | sub(".*?/data/"; "")) as $p | new_base + "/data/" + $p
120-
elif (path | test(".*/metadata/.*")) then
121-
(path | sub(".*?/metadata/"; "")) as $p | new_base + "/metadata/" + $p
121+
if (path | test(".*/metadata/[^/]+$")) then
122+
(path | sub(".*/metadata/"; "")) as $p | new_base + "/metadata/" + $p
123+
elif (path | test(".*/data/[^/]+$")) then
124+
(path | sub(".*/data/"; "")) as $p | new_base + "/data/" + $p
122125
else
123126
new_base
124127
end
@@ -129,17 +132,17 @@ def relativize_path(path; new_base):
129132
else:
130133
jq_filter = f'''
131134
def relativize_path(path; new_base):
132-
if (path | test(".*/index/data/.*")) then
133-
(path | sub(".*?/index/"; "")) as $p | new_base + "/" + $p
134-
elif (path | test(".*/index/metadata/.*")) then
135-
(path | sub(".*?/index/"; "")) as $p | new_base + "/" + $p
135+
if (path | test(".*/index/data/[^/]+$")) then
136+
(path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
137+
elif (path | test(".*/index/metadata/[^/]+$")) then
138+
(path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
136139
elif (path | test(".*/index$")) then
137140
new_base
138141
else
139-
if (path | test(".*/data/.*")) then
140-
(path | sub(".*?/data/"; "")) as $p | new_base + "/data/" + $p
141-
elif (path | test(".*/metadata/.*")) then
142-
(path | sub(".*?/metadata/"; "")) as $p | new_base + "/metadata/" + $p
142+
if (path | test(".*/metadata/[^/]+$")) then
143+
(path | sub(".*/metadata/"; "")) as $p | new_base + "/metadata/" + $p
144+
elif (path | test(".*/data/[^/]+$")) then
145+
(path | sub(".*/data/"; "")) as $p | new_base + "/data/" + $p
143146
else
144147
new_base
145148
end

src/metadata/iceberg_table_metadata.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,16 +106,22 @@ static string GenerateMetaDataUrl(FileSystem &fs, const string &meta_path, strin
106106
if (options.metadata_compression_codec == "gzip") {
107107
compression_suffix = ".gz";
108108
}
109-
for (auto try_format : StringUtil::Split(options.version_name_format, ',')) {
109+
auto version_name_formats = StringUtil::Split(options.version_name_format, ',');
110+
vector<string> tried_paths;
111+
for (auto try_format : version_name_formats) {
110112
url = fs.JoinPath(meta_path, StringUtil::Format(try_format, table_version, compression_suffix));
113+
tried_paths.push_back(url);
111114
if (fs.FileExists(url)) {
112115
return url;
113116
}
114117
}
115118

116-
throw InvalidConfigurationException(
117-
"Iceberg metadata file not found for table version '%s' using '%s' compression and format(s): '%s'",
118-
table_version, options.metadata_compression_codec, options.version_name_format);
119+
string error;
120+
error = StringUtil::Format("Iceberg metadata file not found for table version '%s' using '%s' compression and "
121+
"format(s): '%s', tried paths:\n",
122+
table_version, options.metadata_compression_codec, options.version_name_format);
123+
error += StringUtil::Join(tried_paths, "\n");
124+
throw InvalidConfigurationException(error);
119125
}
120126

121127
string IcebergTableMetadata::GetTableVersionFromHint(const string &meta_path, FileSystem &fs,

0 commit comments

Comments
 (0)