
Commit a72c865

Simplify working with query columns (#1382)

* Get rid of custom columns utils and use sqlalchemy methods.
* Fill sys__id on partitioning instead of creating temp table.
* Tests refactoring + more tests added.

Co-authored-by: Vladimir Rudnyh <[email protected]>

1 parent: d5d89c0
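The central idea of the change: SQLAlchemy's `Select.selected_columns` is a `ColumnCollection` that already supports membership tests, `.get()` lookups, and attribute access, so the custom helpers in `datachain/query/utils.py` are unnecessary. A minimal sketch of those three idioms (table and column names are illustrative, not from the diff):

```python
import sqlalchemy as sa

# Illustrative table; only sys__id matters for the pattern below.
tbl = sa.table("rows", sa.column("sys__id", sa.Integer), sa.column("name", sa.String))
query = sa.select(tbl.c.sys__id, tbl.c.name)

# Membership test, .get() lookup, and attribute access -- the sqlalchemy
# idioms this commit uses in place of the removed helpers.
assert "sys__id" in query.selected_columns
id_col = query.selected_columns.get("sys__id")  # None when the column is absent
assert id_col.name == "sys__id"
assert query.selected_columns.sys__id.name == "sys__id"
```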

File tree: 10 files changed, +485 −367 lines changed

src/datachain/data_storage/warehouse.py

Lines changed: 34 additions & 18 deletions
```diff
@@ -22,7 +22,6 @@
 from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.query.batch import RowsOutput
 from datachain.query.schema import ColumnMeta
-from datachain.query.utils import get_query_id_column
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.utils import sql_escape_like
```
```diff
@@ -228,7 +227,8 @@ def dataset_select_paginated(
         while True:
             if limit is not None:
                 limit -= num_yielded
-                if limit == 0:
+                num_yielded = 0
+                if limit <= 0:
                     break
                 if limit < page_size:
                     paginated_query = paginated_query.limit(None).limit(limit)
```
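Why this two-line change matters: the old loop never reset `num_yielded`, so `limit` was decremented by a stale count on every pass, and `limit == 0` could be skipped entirely when a page overshot the remaining limit. A self-contained sketch of the corrected bookkeeping (`fetch_page` is a stand-in for executing the paginated query):

```python
def paginate(fetch_page, page_size: int, limit: int | None = None):
    """Yield rows page by page, honoring an optional overall limit."""
    offset = 0
    num_yielded = 0
    while True:
        if limit is not None:
            limit -= num_yielded
            num_yielded = 0          # reset so the next pass doesn't double-count
            if limit <= 0:           # <= rather than ==: a page may overshoot
                break
            page_size = min(page_size, limit)  # mirrors .limit(None).limit(limit)
        rows = fetch_page(offset, page_size)
        if not rows:
            break                    # no more results
        yield from rows
        num_yielded = len(rows)
        offset += page_size

# 7 available rows, page size 3, overall limit 5 -> exactly 5 rows.
data = list(range(7))

def pages(off, n):
    return data[off : off + n]

assert list(paginate(pages, page_size=3, limit=5)) == [0, 1, 2, 3, 4]
```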
```diff
@@ -246,32 +246,48 @@ def dataset_select_paginated(
                 break  # no more results
             offset += page_size
 
-    def _regenerate_system_columns(self, selectable):
-        """Return a SELECT that regenerates sys__id and sys__rand deterministically."""
+    def _regenerate_system_columns(
+        self,
+        selectable: sa.Select | sa.CTE,
+        keep_existing_columns: bool = False,
+    ) -> sa.Select:
+        """
+        Return a SELECT that regenerates sys__id and sys__rand deterministically.
 
+        If keep_existing_columns is True, existing sys__id and sys__rand columns
+        will be kept as-is if they exist in the input selectable.
+        """
         base = selectable.subquery() if hasattr(selectable, "subquery") else selectable
 
+        result_columns: dict[str, sa.ColumnElement] = {}
+        for col in base.c:
+            if col.name in result_columns:
+                raise ValueError(f"Duplicate column name {col.name} in SELECT")
+            if col.name in ("sys__id", "sys__rand"):
+                if keep_existing_columns:
+                    result_columns[col.name] = col
+            else:
+                result_columns[col.name] = col
+
         system_types: dict[str, sa.types.TypeEngine] = {
             sys_col.name: sys_col.type
             for sys_col in self.schema.dataset_row_cls.sys_columns()
         }
 
-        result_columns = []
-        for col in base.c:
-            if col.name == "sys__id":
-                expr = self._system_row_number_expr()
-                expr = sa.cast(expr, system_types["sys__id"])
-                result_columns.append(expr.label("sys__id"))
-            elif col.name == "sys__rand":
-                expr = self._system_random_expr()
-                expr = sa.cast(expr, system_types["sys__rand"])
-                result_columns.append(expr.label("sys__rand"))
-            else:
-                result_columns.append(col)
+        # Add missing system columns if needed
+        if "sys__id" not in result_columns:
+            expr = self._system_row_number_expr()
+            expr = sa.cast(expr, system_types["sys__id"])
+            result_columns["sys__id"] = expr.label("sys__id")
+        if "sys__rand" not in result_columns:
+            expr = self._system_random_expr()
+            expr = sa.cast(expr, system_types["sys__rand"])
+            result_columns["sys__rand"] = expr.label("sys__rand")
 
         # Wrap in subquery to materialize window functions, then wrap again in SELECT
         # This ensures window functions are computed before INSERT...FROM SELECT
-        inner = sa.select(*result_columns).select_from(base).subquery()
+        columns = list(result_columns.values())
+        inner = sa.select(*columns).select_from(base).subquery()
         return sa.select(*inner.c).select_from(inner)
 
     def _system_row_number_expr(self):
```
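For reference, the kind of expressions the two helpers above return, sketched with generic SQLAlchemy constructs (the real implementations are backend-specific; the exact functions and types here are assumptions):

```python
import sqlalchemy as sa

# sys__id: a deterministic row number from a window function.
row_number_expr = sa.func.row_number().over()

# sys__rand: a non-negative value from the backend's RNG (SQLite-style).
random_expr = sa.func.abs(sa.func.random())

stmt = sa.select(
    sa.cast(row_number_expr, sa.BigInteger).label("sys__id"),
    sa.cast(random_expr, sa.BigInteger).label("sys__rand"),
)
```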
```diff
@@ -380,7 +396,7 @@ def dataset_rows_select_from_ids(
         """
         Fetch dataset rows from database using a list of IDs.
         """
-        if (id_col := get_query_id_column(query)) is None:
+        if (id_col := query.selected_columns.get("sys__id")) is None:
             raise RuntimeError("sys__id column not found in query")
 
         query = query._clone().offset(None).limit(None).order_by(None)
```

src/datachain/query/batch.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -6,7 +6,6 @@
 import sqlalchemy as sa
 
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
-from datachain.query.utils import get_query_column
 
 RowsOutputBatch = Sequence[Sequence]
 RowsOutput = Sequence | RowsOutputBatch
@@ -106,7 +105,7 @@ def __call__(
         query: sa.Select,
         id_col: sa.ColumnElement | None = None,
     ) -> Generator[RowsOutput, None, None]:
-        if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
+        if (partition_col := query.selected_columns.get(PARTITION_COLUMN_ID)) is None:
             raise RuntimeError("partition column not found in query")
 
         ids_only = False
```
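For context, partitioned batching groups consecutive rows that share a partition id into a single batch, which is why the partition column must be present in the query. A rough sketch of the grouping step (not the class's verbatim logic), assuming rows arrive sorted by partition id:

```python
from itertools import groupby
from operator import itemgetter

def batch_by_partition(rows, partition_idx: int):
    """Group rows sharing a partition id; rows must be pre-sorted by it."""
    for _, group in groupby(rows, key=itemgetter(partition_idx)):
        yield list(group)

rows = [(1, "a"), (1, "b"), (2, "c")]  # (partition_id, payload) tuples
assert list(batch_by_partition(rows, 0)) == [[(1, "a"), (1, "b")], [(2, "c")]]
```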

src/datachain/query/dataset.py

Lines changed: 12 additions & 22 deletions
```diff
@@ -438,6 +438,9 @@ def create_result_query(
         """
 
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
+        if "sys__id" not in query.selected_columns:
+            raise RuntimeError("Query must have sys__id column to run UDF")
+
         if (rows_total := self.catalog.warehouse.query_count(query)) == 0:
             return
 
@@ -580,13 +583,10 @@ def create_partitions_table(self, query: Select) -> "Table":
         """
         Create temporary table with group by partitions.
         """
-        # Check if partition_by is set, we need it to create partitions.
-        assert self.partition_by is not None
-        # Check if sys__id is in the query, we need it to be able to join
-        # the partition table with the udf table later.
-        assert any(c.name == "sys__id" for c in query.selected_columns), (
-            "Query must have sys__id column to use partitioning."
-        )
+        if self.partition_by is None:
+            raise RuntimeError("Query must have partition_by set to use partitioning")
+        if (id_col := query.selected_columns.get("sys__id")) is None:
+            raise RuntimeError("Query must have sys__id column to use partitioning")
 
         if isinstance(self.partition_by, (list, tuple, GeneratorType)):
             list_partition_by = list(self.partition_by)
@@ -602,7 +602,7 @@ def create_partitions_table(self, query: Select) -> "Table":
 
         # fill table with partitions
         cols = [
-            query.selected_columns.sys__id,
+            id_col,
             f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
         ]
         self.catalog.warehouse.db.execute(
```
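The `dense_rank()` window call above is what assigns each distinct partition a compact number; a standalone sketch of the same construction (table and column names are illustrative):

```python
import sqlalchemy as sa
from sqlalchemy import func as f

rows = sa.table("rows", sa.column("sys__id"), sa.column("category"))
partition_by = [rows.c.category]

# One row per source row: its sys__id plus a dense partition number,
# ready to feed an INSERT into the temporary partitions table, which is
# later joined back to the UDF input on sys__id.
select_partitions = sa.select(
    rows.c.sys__id,
    f.dense_rank().over(order_by=partition_by).label("partition_id"),
)
```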
```diff
@@ -634,21 +634,11 @@ def apply(
 
         # Apply partitioning if needed.
         if self.partition_by is not None:
-            if not any(c.name == "sys__id" for c in query.selected_columns):
-                # If sys__id is not in the query, we need to create a temp table
-                # to hold the query results, so we can join it with the
-                # partition table later.
-                columns = [
-                    c if isinstance(c, Column) else Column(c.name, c.type)
-                    for c in query.subquery().columns
-                ]
-                temp_table = self.catalog.warehouse.create_dataset_rows_table(
-                    self.catalog.warehouse.temp_table_name(),
-                    columns=columns,
+            if "sys__id" not in query.selected_columns:
+                _query = query = self.catalog.warehouse._regenerate_system_columns(
+                    query,
+                    keep_existing_columns=True,
                 )
-                temp_tables.append(temp_table.name)
-                self.catalog.warehouse.copy_table(temp_table, query)
-                _query = query = temp_table.select()
 
             partition_tbl = self.create_partitions_table(query)
             temp_tables.append(partition_tbl.name)
```

src/datachain/query/dispatch.py

Lines changed: 25 additions & 35 deletions
```diff
@@ -22,7 +22,6 @@
 )
 from datachain.query.queue import get_from_queue, put_into_queue
 from datachain.query.udf import UdfInfo
-from datachain.query.utils import get_query_id_column
 from datachain.utils import batched, flatten, safe_closing
 
 if TYPE_CHECKING:
@@ -55,6 +54,9 @@ def udf_entrypoint() -> int:
     udf_info: UdfInfo = load(stdin.buffer)
 
     query = udf_info["query"]
+    if "sys__id" not in query.selected_columns:
+        raise RuntimeError("sys__id column is required in UDF query")
+
     batching = udf_info["batching"]
     is_generator = udf_info["is_generator"]
 
@@ -65,15 +67,16 @@ def udf_entrypoint() -> int:
     wh_cls, wh_args, wh_kwargs = udf_info["warehouse_clone_params"]
     warehouse: AbstractWarehouse = wh_cls(*wh_args, **wh_kwargs)
 
-    id_col = get_query_id_column(query)
-
     with contextlib.closing(
-        batching(warehouse.dataset_select_paginated, query, id_col=id_col)
+        batching(
+            warehouse.dataset_select_paginated,
+            query,
+            id_col=query.selected_columns.sys__id,
+        )
     ) as udf_inputs:
         try:
             UDFDispatcher(udf_info).run_udf(
                 udf_inputs,
-                ids_only=id_col is not None,
                 download_cb=download_cb,
                 processed_cb=processed_cb,
                 generated_cb=generated_cb,
@@ -147,10 +150,10 @@ def _create_worker(self) -> "UDFWorker":
             self.udf_fields,
         )
 
-    def _run_worker(self, ids_only: bool) -> None:
+    def _run_worker(self) -> None:
         try:
             worker = self._create_worker()
-            worker.run(ids_only)
+            worker.run()
         except (Exception, KeyboardInterrupt) as e:
             if self.done_queue:
                 put_into_queue(
@@ -164,7 +167,6 @@ def _run_worker(self, ids_only: bool) -> None:
     def run_udf(
         self,
         input_rows: Iterable["RowsOutput"],
-        ids_only: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
         generated_cb: Callback = DEFAULT_CALLBACK,
@@ -178,9 +180,7 @@ def run_udf(
 
         if n_workers == 1:
             # no need to spawn worker processes if we are running in a single process
-            self.run_udf_single(
-                input_rows, ids_only, download_cb, processed_cb, generated_cb
-            )
+            self.run_udf_single(input_rows, download_cb, processed_cb, generated_cb)
         else:
             if self.buffer_size < n_workers:
                 raise RuntimeError(
@@ -189,13 +189,12 @@ def run_udf(
             )
 
         self.run_udf_parallel(
-            n_workers, input_rows, ids_only, download_cb, processed_cb, generated_cb
+            n_workers, input_rows, download_cb, processed_cb, generated_cb
         )
 
     def run_udf_single(
         self,
         input_rows: Iterable["RowsOutput"],
-        ids_only: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
         generated_cb: Callback = DEFAULT_CALLBACK,
@@ -204,18 +203,15 @@ def run_udf_single(
         # Rebuild schemas in single process too for consistency (cheap, idempotent).
         ModelStore.rebuild_all()
 
-        if ids_only and not self.is_batching:
+        if not self.is_batching:
             input_rows = flatten(input_rows)
 
         def get_inputs() -> Iterable["RowsOutput"]:
             warehouse = self.catalog.warehouse.clone()
-            if ids_only:
-                for ids in batched(input_rows, DEFAULT_BATCH_SIZE):
-                    yield from warehouse.dataset_rows_select_from_ids(
-                        self.query, ids, self.is_batching
-                    )
-            else:
-                yield from input_rows
+            for ids in batched(input_rows, DEFAULT_BATCH_SIZE):
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, ids, self.is_batching
+                )
 
         prefetch = udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
@@ -249,7 +245,6 @@ def run_udf_parallel(  # noqa: C901, PLR0912
         self,
         n_workers: int,
         input_rows: Iterable["RowsOutput"],
-        ids_only: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
         generated_cb: Callback = DEFAULT_CALLBACK,
@@ -258,9 +253,7 @@ def run_udf_parallel(  # noqa: C901, PLR0912
         self.done_queue = self.ctx.Queue()
 
         pool = [
-            self.ctx.Process(
-                name=f"Worker-UDF-{i}", target=self._run_worker, args=[ids_only]
-            )
+            self.ctx.Process(name=f"Worker-UDF-{i}", target=self._run_worker)
             for i in range(n_workers)
         ]
         for p in pool:
@@ -406,13 +399,13 @@ def __init__(
         self.processed_cb = ProcessedCallback("processed", self.done_queue)
         self.generated_cb = ProcessedCallback("generated", self.done_queue)
 
-    def run(self, ids_only: bool) -> None:
+    def run(self) -> None:
         prefetch = self.udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
             catalog = clone_catalog_with_cache(self.catalog, _cache)
             udf_results = self.udf.run(
                 self.udf_fields,
-                self.get_inputs(ids_only),
+                self.get_inputs(),
                 catalog,
                 self.cache,
                 download_cb=self.download_cb,
@@ -434,13 +427,10 @@ def notify_and_process(self, udf_results):
             put_into_queue(self.done_queue, {"status": OK_STATUS})
             yield row
 
-    def get_inputs(self, ids_only: bool) -> Iterable["RowsOutput"]:
+    def get_inputs(self) -> Iterable["RowsOutput"]:
         warehouse = self.catalog.warehouse.clone()
         while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
-            if ids_only:
-                for ids in batched(batch, DEFAULT_BATCH_SIZE):
-                    yield from warehouse.dataset_rows_select_from_ids(
-                        self.query, ids, self.is_batching
-                    )
-            else:
-                yield from batch
+            for ids in batched(batch, DEFAULT_BATCH_SIZE):
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, ids, self.is_batching
+                )
```
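With `ids_only` gone, every worker batch is unconditionally a batch of `sys__id` values that gets re-fetched in fixed-size chunks. A sketch of that flow, with `batched` re-implemented as a stand-in for `datachain.utils.batched` and an assumed chunk size:

```python
from collections.abc import Iterable, Iterator
from itertools import islice

DEFAULT_BATCH_SIZE = 2000  # assumed value, for illustration only

def batched(it: Iterable, n: int) -> Iterator[tuple]:
    """Stand-in for datachain.utils.batched: fixed-size chunks."""
    it = iter(it)
    while chunk := tuple(islice(it, n)):
        yield chunk

def fetch_rows(select_from_ids, query, ids: Iterable[int], is_batching: bool):
    """Re-fetch full rows for each chunk of ids, as the workers now do."""
    for chunk in batched(ids, DEFAULT_BATCH_SIZE):
        yield from select_from_ids(query, chunk, is_batching)
```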

src/datachain/query/utils.py

Lines changed: 0 additions & 38 deletions
This file was deleted.
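The call sites above imply what the deleted helpers did; a rough reconstruction (not the verbatim deleted code) makes clear why `ColumnCollection` subsumes them:

```python
import sqlalchemy as sa

def get_query_column(query: sa.Select, name: str) -> sa.ColumnElement | None:
    # A linear scan over selected columns -- exactly what
    # query.selected_columns.get(name) already provides.
    return next((c for c in query.selected_columns if c.name == name), None)

def get_query_id_column(query: sa.Select) -> sa.ColumnElement | None:
    return get_query_column(query, "sys__id")
```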
