Skip to content

Commit ba66e59

Browse files
committed
fix(ingest): preserve cloud workspace URIs in CollectionBuilder base class
CollectionBuilder.__init__ unconditionally wrapped workspace_dir in Path(), corrupting cloud URIs like s3://bucket/path into s3:/bucket/path. Keep cloud URIs (strings containing :// that do not start with file://) as strings; continue using Path for local filesystem paths, including file:// values and existing Path instances. This fixes the root cause at the base class level, complementing the call-site guard added in 1977f91.
1 parent 9231fc9 commit ba66e59

File tree

2 files changed

+31
-1
lines changed

2 files changed

+31
-1
lines changed

src/rasteret/ingest/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,16 @@ def __init__(
4444
) -> None:
4545
self.name = name
4646
self.data_source = data_source
47-
self.workspace_dir = Path(workspace_dir) if workspace_dir else None
47+
if workspace_dir is None:
48+
self.workspace_dir: str | Path | None = None
49+
elif isinstance(workspace_dir, Path):
50+
self.workspace_dir = workspace_dir
51+
else:
52+
ws = str(workspace_dir)
53+
if "://" in ws and not ws.startswith("file://"):
54+
self.workspace_dir = ws
55+
else:
56+
self.workspace_dir = Path(ws)
4857

4958
@abstractmethod
5059
def build(self, **kwargs: Any) -> "Collection":

src/rasteret/tests/test_ingest.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,27 @@ def test_load_valid_manifest(self, tmp_path):
130130
assert isinstance(collection, Collection)
131131
assert collection.name == "manifest"
132132

133+
def test_workspace_dir_cloud_uri_not_mangled(self, tmp_path, monkeypatch):
134+
path = tmp_path / "manifest.parquet"
135+
_write_manifest(path)
136+
captured: dict[str, object] = {}
137+
138+
def _fake_build_collection_from_table(table: pa.Table, **kwargs):
139+
captured["workspace_dir"] = kwargs.get("workspace_dir")
140+
return Collection(dataset=ds.dataset(table), name="manifest")
141+
142+
monkeypatch.setattr(
143+
"rasteret.ingest.parquet_record_table.build_collection_from_table",
144+
_fake_build_collection_from_table,
145+
)
146+
147+
builder = RecordTableBuilder(
148+
path,
149+
workspace_dir="s3://demo-bucket/workspace_records",
150+
)
151+
builder.build()
152+
assert captured["workspace_dir"] == "s3://demo-bucket/workspace_records"
153+
133154
def test_column_remapping(self, tmp_path):
134155
table = pa.table(
135156
{

0 commit comments

Comments
 (0)