66import shutil
77import tempfile
88import subprocess
9+ import re
910from datetime import datetime
10- from pathlib import Path
11+ from pathlib import Path , PurePosixPath
1112
1213
1314def parse_args ():
@@ -22,10 +23,12 @@ def abort(msg):
2223
2324
2425def relativize_path (abs_path : str , new_base : str ) -> str :
25- if "/data/" in abs_path :
26- return os .path .join (new_base , "data" , abs_path .split ("/data/" , 1 )[1 ])
27- elif "/metadata/" in abs_path :
28- return os .path .join (new_base , "metadata" , abs_path .split ("/metadata/" , 1 )[1 ])
26+ p = PurePosixPath (abs_path )
27+ parent = p .parent .name
28+ if parent in ("data" , "metadata" ):
29+ return os .path .join (new_base , parent , p .name )
30+ elif p .parent .parent .name == "index" and parent in ("data" , "metadata" ):
31+ return os .path .join (new_base , parent , p .name )
2932 else :
3033 return new_base
3134
@@ -47,40 +50,40 @@ def validate_paths(table_path: Path):
4750def jq_rewrite_json (file_path : Path , output_path : Path , new_base : str ):
4851 jq_filter = f'''
4952 def relativize_path(path; new_base):
50- if (path | test(".*/index/data/.* ")) then
51- (path | sub(".*? /index/"; "")) as $p | new_base + "/" + $p
52- elif (path | test(".*/index/metadata/.* ")) then
53- (path | sub(".*? /index/"; "")) as $p | new_base + "/" + $p
53+ if (path | test(".*/index/data/[^/]+$ ")) then
54+ (path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
55+ elif (path | test(".*/index/metadata/[^/]+$ ")) then
56+ (path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
5457 elif (path | test(".*/index$")) then
5558 new_base
5659 else
57- if (path | test(".*/data/.* ")) then
58- (path | sub(".*?/data /"; "")) as $p | new_base + "/data /" + $p
59- elif (path | test(".*/metadata/.* ")) then
60- (path | sub(".*?/metadata /"; "")) as $p | new_base + "/metadata /" + $p
60+ if (path | test(".*/metadata/[^/]+$ ")) then
61+ (path | sub(".*/metadata /"; "")) as $p | new_base + "/metadata /" + $p
62+ elif (path | test(".*/data/[^/]+$ ")) then
63+ (path | sub(".*/data /"; "")) as $p | new_base + "/data /" + $p
6164 else
6265 new_base
6366 end
6467 end;
6568
66- .location = relativize_path(.location; "{ new_base } ") |
69+ .location = relativize_path(.location; "") |
6770 if .snapshots then
6871 .snapshots |= map(
6972 if .["manifest-list"] then
70- .["manifest-list"] = relativize_path(.["manifest-list"]; "{ new_base } ")
73+ .["manifest-list"] = relativize_path(.["manifest-list"]; "")
7174 else . end
7275 )
7376 else . end |
7477 if .["metadata-log"] then
7578 .["metadata-log"] |= map(
7679 if .["metadata-file"] then
77- .["metadata-file"] = relativize_path(.["metadata-file"]; "{ new_base } ")
80+ .["metadata-file"] = relativize_path(.["metadata-file"]; "")
7881 else . end
7982 )
8083 else . end
8184 '''
8285 with open (output_path , 'w' ) as out :
83- returncode = subprocess .run (["jq" , jq_filter , str (file_path )], stdout = out , check = True )
86+ subprocess .run (["jq" , jq_filter , str (file_path )], stdout = out , check = True )
8487
8588
8689def process_json_files (metadata_dir : Path , temp_metadata_dir : Path , new_base : str ):
@@ -108,17 +111,17 @@ def process_avro_file(avro_file: Path, temp_dir: Path, temp_metadata_dir: Path,
108111 if filename .startswith ("snap-" ):
109112 jq_filter = f'''
110113 def relativize_path(path; new_base):
111- if (path | test(".*/index/data/.* ")) then
112- (path | sub(".*? /index/"; "")) as $p | new_base + "/" + $p
113- elif (path | test(".*/index/metadata/.* ")) then
114- (path | sub(".*? /index/"; "")) as $p | new_base + "/" + $p
114+ if (path | test(".*/index/data/[^/]+$ ")) then
115+ (path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
116+ elif (path | test(".*/index/metadata/[^/]+$ ")) then
117+ (path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
115118 elif (path | test(".*/index$")) then
116119 new_base
117120 else
118- if (path | test(".*/data/.* ")) then
119- (path | sub(".*?/data /"; "")) as $p | new_base + "/data /" + $p
120- elif (path | test(".*/metadata/.* ")) then
121- (path | sub(".*?/metadata /"; "")) as $p | new_base + "/metadata /" + $p
121+ if (path | test(".*/metadata/[^/]+$ ")) then
122+ (path | sub(".*/metadata /"; "")) as $p | new_base + "/metadata /" + $p
123+ elif (path | test(".*/data/[^/]+$ ")) then
124+ (path | sub(".*/data /"; "")) as $p | new_base + "/data /" + $p
122125 else
123126 new_base
124127 end
@@ -129,17 +132,17 @@ def relativize_path(path; new_base):
129132 else :
130133 jq_filter = f'''
131134 def relativize_path(path; new_base):
132- if (path | test(".*/index/data/.* ")) then
133- (path | sub(".*? /index/"; "")) as $p | new_base + "/" + $p
134- elif (path | test(".*/index/metadata/.* ")) then
135- (path | sub(".*? /index/"; "")) as $p | new_base + "/" + $p
135+ if (path | test(".*/index/data/[^/]+$ ")) then
136+ (path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
137+ elif (path | test(".*/index/metadata/[^/]+$ ")) then
138+ (path | sub(".*/index/"; "")) as $p | new_base + "/" + $p
136139 elif (path | test(".*/index$")) then
137140 new_base
138141 else
139- if (path | test(".*/data/.* ")) then
140- (path | sub(".*?/data /"; "")) as $p | new_base + "/data /" + $p
141- elif (path | test(".*/metadata/.* ")) then
142- (path | sub(".*?/metadata /"; "")) as $p | new_base + "/metadata /" + $p
142+ if (path | test(".*/metadata/[^/]+$ ")) then
143+ (path | sub(".*/metadata /"; "")) as $p | new_base + "/metadata /" + $p
144+ elif (path | test(".*/data/[^/]+$ ")) then
145+ (path | sub(".*/data /"; "")) as $p | new_base + "/data /" + $p
143146 else
144147 new_base
145148 end
0 commit comments