Skip to content

Commit a0eda41

Browse files
shcheklein, Copilot
authored
Make pull and read_dataset from Studio atomic (#1573)
* add basic pull failure and cleanup tests * cleanup remote read_dataset tests * add cleanup on failure to read_dataset remote * remote read: kill in the middle cleanup * first pass to implement this * address review findings * add more tests * keep addressing concurrency edge cases * more tests, more edge cases, cleanup messages * use file lock to wait on concurrent pulls * address more review comments * fix test failing on SaaS * Update src/datachain/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update src/datachain/catalog/catalog.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * address more review comments * address more reviews * Use `except Exception:` instead of bare `except:` for cleanup-and-reraise (#1585) * Initial plan * Use `except Exception:` instead of bare `except:` for better interrupt handling Co-authored-by: shcheklein <3659196+shcheklein@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: shcheklein <3659196+shcheklein@users.noreply.github.com> * address more reviews * add more tests for pull * fix test coverage * address PR reviews * debug CI failure * fix lock file tests --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: shcheklein <3659196+shcheklein@users.noreply.github.com>
1 parent 47878dd commit a0eda41

File tree

11 files changed

+1626
-337
lines changed

11 files changed

+1626
-337
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ dependencies = [
5151
"huggingface_hub",
5252
"iterative-telemetry>=0.0.10",
5353
"platformdirs",
54+
"filelock",
5455
"dvc-studio-client>=0.21,<1",
5556
"tabulate",
5657
"websockets",

src/datachain/catalog/catalog.py

Lines changed: 235 additions & 185 deletions
Large diffs are not rendered by default.

src/datachain/data_storage/metastore.py

Lines changed: 71 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from abc import ABC, abstractmethod
66
from collections.abc import Iterator
77
from contextlib import contextmanager, nullcontext, suppress
8-
from datetime import datetime, timezone
8+
from datetime import datetime, timedelta, timezone
99
from functools import cached_property, reduce
1010
from itertools import groupby
1111
from typing import TYPE_CHECKING, Any
@@ -32,6 +32,7 @@
3232
cast,
3333
desc,
3434
literal,
35+
or_,
3536
select,
3637
)
3738
from sqlalchemy.sql import func as f
@@ -71,6 +72,11 @@
7172
from datachain.namespace import Namespace
7273
from datachain.project import Project
7374

75+
# Versions with no job_id (e.g. from pull_dataset) are only eligible
76+
# for gc cleanup if they are older than this threshold, to avoid
77+
# cleaning up in-flight operations.
78+
STALE_CREATED_THRESHOLD_HOURS = 1
79+
7480
if TYPE_CHECKING:
7581
from sqlalchemy import CTE, Delete, Insert, Select, Subquery, Update
7682
from sqlalchemy.schema import SchemaItem
@@ -335,20 +341,18 @@ def get_incomplete_dataset_versions(
335341
self, job_id: str | None = None
336342
) -> list[tuple[DatasetRecord, str]]:
337343
"""
338-
Get failed/incomplete dataset versions that are in complete job. This is
339-
used to get versions to cleanup.
344+
Get incomplete dataset versions to clean up.
340345
341-
Returns dataset versions that:
342-
- Have status CREATED or FAILED (incomplete/failed)
343-
- Belong to jobs that are not running (COMPLETE, FAILED, CANCELED)
346+
When job_id is provided, returns versions belonging to that specific
347+
job (used during job failure cleanup).
344348
345-
Cleans both CREATED and FAILED to handle edge cases:
346-
- FAILED: Explicitly marked failed versions
347-
- CREATED: Orphaned versions from crashes/bugs (before failure marking)
349+
When job_id is None, returns all incomplete dataset versions
350+
whose associated job is finished, plus versions with no job_id
351+
that are older than STALE_CREATED_THRESHOLD_HOURS (used by gc).
348352
349353
Returns:
350354
List of (DatasetRecord, version_string) tuples. Each DatasetRecord
351-
contains only one version (the failed version to clean).
355+
contains only one version (the incomplete version to clean).
352356
"""
353357

354358
@abstractmethod
@@ -373,6 +377,14 @@ def list_datasets_by_prefix(
373377
projects.
374378
"""
375379

380+
def get_dataset_by_version_uuid(
    self,
    uuid: str,
    include_incomplete: bool = False,
) -> DatasetRecord:
    """Gets a dataset that contains a version with the given UUID.

    Args:
        uuid: UUID of the dataset version to look up.
        include_incomplete: when True, presumably also matches versions
            that are not yet finalized — confirm against the concrete
            metastore's `_base_dataset_query` semantics.

    Returns:
        DatasetRecord containing the matching version.

    Raises:
        NotImplementedError: this is a default stub; concrete metastore
            implementations are expected to override it.
    """
    raise NotImplementedError
387+
376388
@abstractmethod
377389
def get_dataset(
378390
self,
@@ -1540,6 +1552,20 @@ def list_datasets_by_prefix(
15401552
query = query.where(self._datasets.c.name.startswith(prefix))
15411553
yield from self._parse_dataset_list(self.db.execute(query))
15421554

1555+
def get_dataset_by_version_uuid(
    self,
    uuid: str,
    include_incomplete: bool = False,
) -> DatasetRecord:
    """Gets a dataset that contains a version with the given UUID."""
    versions = self._datasets_versions
    # Narrow the base dataset query down to the single version row
    # whose uuid matches.
    query = self._base_dataset_query(include_incomplete=include_incomplete).where(
        versions.c.uuid == uuid
    )
    dataset = self._parse_dataset(self.db.execute(query))
    if not dataset:
        raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
    return dataset
1568+
15431569
def get_dataset(
15441570
self,
15451571
name: str, # normal, not full dataset name
@@ -1613,29 +1639,50 @@ def get_incomplete_dataset_versions(
16131639
dv = self._datasets_versions
16141640
j = self._jobs
16151641

1616-
# Query dataset + version info for failed versions from non-running jobs
1642+
select_cols = (
1643+
*(getattr(n.c, f) for f in self._namespaces_fields),
1644+
*(getattr(p.c, f) for f in self._projects_fields),
1645+
*(getattr(d.c, f) for f in self._dataset_fields),
1646+
*(getattr(dv.c, f) for f in self._dataset_version_fields),
1647+
)
1648+
base_from = (
1649+
n.join(p, n.c.id == p.c.namespace_id)
1650+
.join(d, p.c.id == d.c.project_id)
1651+
.join(dv, d.c.id == dv.c.dataset_id)
1652+
)
1653+
1654+
# LEFT JOIN on jobs so versions with job_id=NULL are included.
1655+
# Only skip versions whose job is still running.
16171656
query = (
1618-
self._datasets_select(
1619-
*(getattr(n.c, f) for f in self._namespaces_fields),
1620-
*(getattr(p.c, f) for f in self._projects_fields),
1621-
*(getattr(d.c, f) for f in self._dataset_fields),
1622-
*(getattr(dv.c, f) for f in self._dataset_version_fields),
1623-
)
1657+
self._datasets_select(*select_cols)
16241658
.select_from(
1625-
n.join(p, n.c.id == p.c.namespace_id)
1626-
.join(d, p.c.id == d.c.project_id)
1627-
.join(dv, d.c.id == dv.c.dataset_id)
1628-
.join(j, cast(dv.c.job_id, j.c.id.type) == j.c.id)
1659+
base_from.join(
1660+
j,
1661+
cast(dv.c.job_id, j.c.id.type) == j.c.id,
1662+
isouter=True,
1663+
)
16291664
)
16301665
.where(
16311666
dv.c.status.in_([DatasetStatus.CREATED, DatasetStatus.FAILED]),
1632-
j.c.status.in_(
1633-
[JobStatus.COMPLETE, JobStatus.FAILED, JobStatus.CANCELED]
1667+
or_(
1668+
# job is finished
1669+
j.c.status.in_(
1670+
[JobStatus.COMPLETE, JobStatus.FAILED, JobStatus.CANCELED]
1671+
),
1672+
# or no job at all (e.g. pull_dataset) — but only
1673+
# if old enough to not be an in-flight operation
1674+
and_(
1675+
dv.c.job_id.is_(None),
1676+
dv.c.created_at
1677+
< datetime.now(timezone.utc)
1678+
- timedelta(hours=STALE_CREATED_THRESHOLD_HOURS),
1679+
),
16341680
),
16351681
)
16361682
)
1683+
16371684
if job_id:
1638-
query = query.where(j.c.id == job_id)
1685+
query = query.where(dv.c.job_id == job_id)
16391686

16401687
# Parse results and return (dataset, version) tuples
16411688
results = []

src/datachain/data_storage/sqlite.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,21 @@ def get_retry_sleep_sec(retry_count: int) -> int:
8585
return RETRY_START_SEC * (RETRY_FACTOR**retry_count)
8686

8787

88+
SQLITE_BUSY = 5
89+
SQLITE_LOCKED = 6
90+
91+
92+
def _is_sqlite_lock_error(exc: sqlite3.OperationalError) -> bool:
93+
"""Return True if the OperationalError is a transient lock/busy error."""
94+
code = getattr(exc, "sqlite_errorcode", None)
95+
if code is not None:
96+
# Python >=3.11: use the precise error code
97+
return code in (SQLITE_BUSY, SQLITE_LOCKED)
98+
# Python 3.10: fall back to message matching
99+
msg = str(exc).lower()
100+
return "locked" in msg or "busy" in msg
101+
102+
88103
def retry_sqlite_locks(func):
89104
# This retries the database modification in case of concurrent access
90105
@wraps(func)
@@ -94,6 +109,8 @@ def wrapper(*args, **kwargs):
94109
try:
95110
return func(*args, **kwargs)
96111
except sqlite3.OperationalError as operror:
112+
if not _is_sqlite_lock_error(operror):
113+
raise
97114
exc = operror
98115
sleep(get_retry_sleep_sec(retry_count))
99116
raise exc
@@ -158,14 +175,24 @@ def _connect(
158175
# ensure we run SA on_connect init (e.g it registers regexp function),
159176
# also makes sure that it's consistent. Otherwise in some cases it
160177
# seems we are getting different results if engine object is used in a
161-
# different thread first and enine is not used in the Main thread.
178+
# different thread first and engine is not used in the Main thread.
162179
engine.connect().close()
163180

164181
db.isolation_level = None # Use autocommit mode
165182
db.execute("PRAGMA foreign_keys = ON")
166183
db.execute("PRAGMA cache_size = -102400") # 100 MiB
167-
# Enable Write-Ahead Log Journaling
168-
db.execute("PRAGMA journal_mode = WAL")
184+
# Switching to WAL requires an exclusive lock, so retry briefly
185+
# in case another process is initializing the same DB file.
186+
for _ in range(5):
187+
try:
188+
db.execute("PRAGMA journal_mode = WAL")
189+
break
190+
except sqlite3.OperationalError as e:
191+
if not _is_sqlite_lock_error(e):
192+
raise
193+
sleep(1)
194+
else:
195+
db.execute("PRAGMA journal_mode = WAL") # final attempt, let it raise
169196
db.execute("PRAGMA synchronous = NORMAL")
170197
db.execute("PRAGMA case_sensitive_like = ON")
171198

@@ -847,10 +874,6 @@ def get_buffer(
847874
)
848875
return self.buffers[table.name]
849876

850-
def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
851-
dr = self.dataset_rows(dataset, version)
852-
return self.db.insert_dataframe(dr.table.name, df)
853-
854877
def instr(self, source, target) -> "ColumnElement":
855878
return cast(func.instr(source, target), sqlalchemy.Boolean)
856879

src/datachain/data_storage/warehouse.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,14 @@ def create_dataset_rows_table(
391391
) -> sa.Table:
392392
"""Creates a dataset rows table for the given dataset name and columns"""
393393

394+
def insert_dataframe_to_table(self, table_name: str, df) -> int:
    """
    Insert dataframe into any table by name.

    This is used for inserting data into temporary staging tables.

    Args:
        table_name: name of an existing table to insert into.
        df: dataframe whose columns match the table schema
            (presumably a pandas DataFrame — confirm against
            `self.db.insert_dataframe`).

    Returns:
        The count reported by the underlying `insert_dataframe` call.
    """
    return self.db.insert_dataframe(table_name, df)
401+
394402
def drop_dataset_rows_table(
395403
self,
396404
dataset: DatasetRecord,
@@ -538,10 +546,6 @@ def insert_rows_done(self, table: sa.Table) -> None:
538546
"""Signal that row inserts are complete by flushing and closing the buffer."""
539547
self.close_buffer(table)
540548

541-
@abstractmethod
542-
def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
543-
"""Inserts dataset rows directly into dataset table"""
544-
545549
@abstractmethod
546550
def instr(self, source, target) -> sa.ColumnElement:
547551
"""

src/datachain/utils.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,91 @@ def find(cls, create: bool = True) -> "Self":
122122
return instance
123123

124124

125+
@contextmanager
def interprocess_file_lock(
    lock_path: str,
    *,
    wait_message: str | None = None,
    timeout: float = -1,
) -> Iterator[None]:
    """Acquire an inter-process lock backed by a file.

    Intended for local-only concurrency control (multiple CLI processes sharing
    the same DataChainDir). Locks are released automatically by the OS when the
    process exits, including on SIGKILL.

    Uses `filelock.FileLock` (OS-level file locking).
    """

    from filelock import FileLock, Timeout

    parent_dir = osp.dirname(lock_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    lock = FileLock(lock_path)
    pid_path = f"{lock_path}.pid"

    def _owner_pid() -> int | None:
        # Best effort: the pid file may be missing, empty or corrupt.
        try:
            with open(pid_path, encoding="utf-8") as f:
                contents = f.read().strip()
            return int(contents) if contents else None
        except Exception:  # noqa: BLE001
            return None

    def _record_pid() -> None:
        # Advisory only — failing to record our pid must not break locking.
        try:
            with open(pid_path, "w", encoding="utf-8") as f:
                f.write(str(os.getpid()))
        except Exception:
            logger.debug(
                "Failed to write PID into lock file %s",
                pid_path,
                exc_info=True,
            )

    def _announce_wait(pid: int | None) -> None:
        if not wait_message:
            return
        pid_str = f" (pid={pid})" if pid is not None else ""
        if pid is None:
            check_hint = f"If this looks stuck, delete: {lock_path} (and {pid_path})"
        else:
            check_hint = (
                f"If this looks stuck, first check the PID is running "
                f"(e.g. `ps -p {pid}`), then if you are sure no process is "
                f"running delete: {lock_path} (and {pid_path})"
            )
        print(f"{wait_message}{pid_str}\n{check_hint}")

    acquired = False
    try:
        try:
            # Non-blocking first attempt when we want to tell the user who
            # currently holds the lock; otherwise block right away.
            lock.acquire(timeout=0 if wait_message else timeout)
        except Timeout:
            if not wait_message:
                raise
            _announce_wait(_owner_pid())
            lock.acquire(timeout=timeout)
        acquired = True
        _record_pid()
        yield
    finally:
        if acquired:
            # Remove our pid marker first so the next holder writes a fresh
            # one, then release the OS-level lock no matter what.
            try:
                os.remove(pid_path)
            except OSError:
                logger.debug(
                    "Failed to remove PID file %s during lock cleanup",
                    pid_path,
                    exc_info=True,
                )
            finally:
                lock.release()
208+
209+
125210
@dataclass
126211
class DatasetIdentifier:
127212
namespace: str

0 commit comments

Comments
 (0)