Skip to content

Commit a901dbe

Browse files
authored
Merge pull request #33 from atomic-data-sciences/enhancement/windows_integration
Add robustness to Windows usage
2 parents 5cd32f0 + 6cfa360 commit a901dbe

File tree

5 files changed

+116
-23
lines changed

5 files changed

+116
-23
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ dev = [
5050
"pytest-httpserver",
5151
"pytest-cov",
5252
"pytest-order",
53+
"pytest-dependency",
5354
"ruff",
5455
"mypy",
5556
"types-requests",

src/atomicds/client.py

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pycocotools import mask as mask_util
1414

1515
from atomicds.core import BaseClient, ClientError, _FileSlice
16-
from atomicds.core.utils import _make_progress
16+
from atomicds.core.utils import _make_progress, normalize_path
1717
from atomicds.results import RHEEDImageResult, RHEEDVideoResult, XPSResult
1818

1919

@@ -345,7 +345,7 @@ def upload(self, files: list[str | BinaryIO]):
345345
file_data = []
346346
for file in files:
347347
if isinstance(file, str):
348-
path = Path(file)
348+
path = normalize_path(file)
349349
if not (path.exists() and path.is_file()):
350350
raise ClientError(f"{path} is not a file or does not exist")
351351

@@ -363,11 +363,18 @@ def upload(self, files: list[str | BinaryIO]):
363363
file_name = file.name
364364

365365
file_data.append(
366-
{"num_urls": num_urls, "file_name": file_name, "file_size": file_size}
366+
{
367+
"num_urls": num_urls,
368+
"file_name": file_name,
369+
"file_size": file_size,
370+
"file_path": file,
371+
}
367372
)
368373

369374
def __upload_file(
370-
file_info: dict[Literal["num_urls", "file_name", "file_size"], int | str],
375+
file_info: dict[
376+
Literal["num_urls", "file_name", "file_size", "file_path"], int | str
377+
],
371378
):
372379
url_data: list[dict[str, str | int]] = self._post_or_put(
373380
method="POST",
@@ -392,7 +399,7 @@ def __upload_file(
392399
"sub_url": "",
393400
"params": None,
394401
"base_override": part["url"],
395-
"file_name": file_info["file_name"],
402+
"file_path": file_info["file_path"],
396403
"offset": offset,
397404
"length": length,
398405
}
@@ -403,11 +410,11 @@ def __upload_chunk(
403410
sub_url: str,
404411
params: dict[str, Any] | None,
405412
base_override: str,
406-
file_name: str,
413+
file_path: Path,
407414
offset: int,
408415
length: int,
409416
) -> Any:
410-
slice_obj = _FileSlice(file_name, offset, length)
417+
slice_obj = _FileSlice(file_path, offset, length)
411418
return self._post_or_put(
412419
method=method,
413420
sub_url=sub_url,
@@ -435,21 +442,24 @@ def __upload_chunk(
435442
transient=True,
436443
)
437444

438-
# Confirm file upload
439-
etag_body = [
440-
{"ETag": entry["ETag"], "PartNumber": i + 1}
441-
for i, entry in enumerate(etag_data)
442-
]
443-
self._post_or_put(
444-
method="POST",
445-
sub_url="data_entries/raw_data/staged/upload_urls/complete/",
446-
params={"staging_type": "core"},
447-
body={
448-
"upload_id": url_data[0]["upload_id"],
449-
"new_filename": url_data[0]["new_filename"],
450-
"etag_data": etag_body,
451-
},
452-
)
445+
# Complete multipart upload *only* if the backend issued an upload_id
446+
first_part = url_data[0]
447+
upload_id = first_part.get("upload_id")
448+
if upload_id:
449+
etag_body = [
450+
{"ETag": entry["ETag"], "PartNumber": i + 1}
451+
for i, entry in enumerate(etag_data)
452+
]
453+
self._post_or_put(
454+
method="POST",
455+
sub_url="data_entries/raw_data/staged/upload_urls/complete/",
456+
params={"staging_type": "core"},
457+
body={
458+
"upload_id": upload_id,
459+
"new_filename": first_part["new_filename"],
460+
"etag_data": etag_body,
461+
},
462+
)
453463

454464
main_task = None
455465
file_count = len(file_data)

src/atomicds/core/utils.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
import os
2+
import re
3+
import unicodedata
4+
from pathlib import Path
5+
16
import networkx as nx
27
import numpy as np
38
import numpy.typing as npt
@@ -224,3 +229,46 @@ def render(self, task) -> Text:
224229
transient=transient,
225230
refresh_per_second=30,
226231
)
232+
233+
234+
def normalize_path(path_str: str) -> Path:
    """Normalize a raw file path string into a pathlib.Path.

    Steps performed, in order:
    1. Drop Unicode control characters (category "C", e.g. stray newlines).
    2. Convert "smart" quotes into plain ASCII quotes.
    3. Strip leading/trailing whitespace and a *matching* pair of
       surrounding quotes.
    4. Expand environment variables and the user home marker (~).
    5. Normalize Unicode to NFC and unify "/" to the platform separator.
    6. Collapse redundant "."/".." segments.

    Args:
        path_str: Raw path string (e.g. copied from a Windows shell) that
            may contain surrounding quotes, smart quotes, stray control
            characters, etc.

    Returns:
        Path: The normalized filesystem path.
    """
    # 1. Drop control characters (Unicode categories Cc, Cf, ...)
    filtered = "".join(ch for ch in path_str if unicodedata.category(ch)[0] != "C")

    # 2. Convert smart quotes to plain ones
    smart_quotes = {"\u201c": '"', "\u201d": '"', "\u2018": "'", "\u2019": "'"}
    for smart, plain in smart_quotes.items():
        filtered = filtered.replace(smart, plain)

    # 3. Trim whitespace, then strip surrounding quotes only when the
    #    opening and closing characters match. The backreference (\1)
    #    prevents a mismatched pair like `"path'` from being stripped.
    filtered = filtered.strip()
    m = re.match(r'^(["\'])(.*)\1$', filtered)
    if m:
        filtered = m.group(2)

    # 4. Expand env vars first, then ~, so a value substituted from the
    #    environment that itself starts with "~" is still user-expanded
    expanded = os.path.expanduser(os.path.expandvars(filtered))  # noqa: PTH111

    # 5. Normalize Unicode and unify separators ("/" -> os.sep; no-op on POSIX)
    normalized_unicode = unicodedata.normalize("NFC", expanded)
    unified_sep = normalized_unicode.replace("/", os.sep)

    # 6. Collapse redundant "."/".." segments
    final_path = os.path.normpath(unified_sep)

    return Path(final_path)

tests/data/test_rheed.mp4

4.17 KB
Binary file not shown.

tests/test_client.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
import pytest
33
from atomicds import Client
44
from datetime import datetime
5+
from pathlib import Path
56
from unittest import mock
7+
from urllib.parse import urljoin
68
from .conftest import ResultIDs
7-
from atomicds.results import RHEEDVideoResult
89

910

1011
@pytest.fixture
@@ -113,3 +114,36 @@ def test_get(client: Client):
113114
data_types = set([type(result) for result in results])
114115

115116
assert len(data_types) == 3
117+
118+
119+
# @pytest.mark.order(2)
120+
# @pytest.mark.dependency(name="upload", depends=["get"])
121+
# def test_upload(client: Client):
122+
# test_video = str(Path(__file__).parent.absolute()) + "/data/test_rheed.mp4"
123+
# client.upload(files=[test_video])
124+
#
125+
#
126+
# @pytest.mark.order(3)
127+
# @pytest.mark.dependency(depends=["upload"])
128+
# def test_download(client: Client):
129+
# # Get data IDs from uploaded test files
130+
# data = client.search(keywords=["test_rheed"], include_organization_data=False)
131+
# assert len(data["Data ID"].values)
132+
#
133+
# data_ids = list(data["Data ID"].values)
134+
# client.download_videos(data_ids=data_ids, dest_dir="./")
135+
#
136+
# # Cleanup downloaded files
137+
# for data_id in data_ids:
138+
# file_path = Path("./") / f"{data_id}.mp4"
139+
# if file_path.exists():
140+
# file_path.unlink()
141+
#
142+
# response = client.session.delete(
143+
# url=urljoin(client.endpoint, "/data_entries"),
144+
# verify=True,
145+
# params={"data_ids": data_ids},
146+
# )
147+
# assert (
148+
# response.ok
149+
# ), f"Failed to delete data entries: {response.status_code} - {response.text}"

0 commit comments

Comments
 (0)