Skip to content

Commit afecf1a

Browse files
authored
fix(gen): always prepare table for now to have sys columns (#1428)
1 parent 1ba2f89 commit afecf1a

File tree

3 files changed

+37
-9
lines changed

3 files changed

+37
-9
lines changed

src/datachain/query/dataset.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -426,8 +426,10 @@ def create_udf_table(self, query: Select) -> "Table":
426426
"""Method that creates a table where temp udf results will be saved"""
427427

428428
def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]:
429-
"""Apply any necessary processing to the input query"""
430-
return query, []
429+
"""Materialize inputs, ensure sys columns are available, needed for checkpoints,
430+
needed for map to work (merge results)"""
431+
table = self.catalog.warehouse.create_pre_udf_table(query)
432+
return sqlalchemy.select(*table.c), [table]
431433

432434
@abstractmethod
433435
def create_result_query(
@@ -675,13 +677,6 @@ def create_udf_table(self, query: Select) -> "Table":
675677

676678
return self.catalog.warehouse.create_udf_table(udf_output_columns)
677679

678-
def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]:
679-
if os.getenv("DATACHAIN_DISABLE_QUERY_CACHE", "") not in ("", "0"):
680-
return query, []
681-
table = self.catalog.warehouse.create_pre_udf_table(query)
682-
q: Select = sqlalchemy.select(*table.c)
683-
return q, [table]
684-
685680
def create_result_query(
686681
self, udf_table, query
687682
) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]:

tests/func/test_datachain.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1699,10 +1699,16 @@ def process(filename: list[str]) -> Iterator[tuple[str, int]]:
16991699
value=list(range(100)),
17001700
session=catalog_tmpfile.session,
17011701
)
1702+
# Read values in general doesn't guarantee order, so we need to order first
1703+
ds = ds.order_by("filename")
17021704
if offset is not None:
17031705
ds = ds.offset(offset)
17041706
if limit is not None:
17051707
ds = ds.limit(limit)
1708+
1709+
limited_filenames = ds.to_values("filename")
1710+
assert set(limited_filenames) == set(files)
1711+
17061712
ds = (
17071713
ds.settings(parallel=parallel)
17081714
.agg(

tests/func/test_union.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,30 @@ def test_union_parallel_udf_ids_only_no_dup(test_session_tmpfile, monkeypatch):
5757
assert total == 2 * n
5858
assert len(distinct_idx) == 2 * n
5959
assert total == len(distinct_idx)
60+
61+
62+
def test_union_parallel_gen_ids_only_no_dup(test_session_tmpfile, monkeypatch):
63+
monkeypatch.setattr("datachain.query.dispatch.DEFAULT_BATCH_SIZE", 5, raising=False)
64+
n = 30
65+
66+
x_ids = list(range(n))
67+
y_ids = list(range(n, 2 * n))
68+
69+
x = dc.read_values(idx=x_ids, session=test_session_tmpfile)
70+
y = dc.read_values(idx=y_ids, session=test_session_tmpfile)
71+
72+
xy = x.union(y)
73+
74+
def expand(idx):
75+
yield f"val-{idx}"
76+
77+
generated = xy.settings(parallel=2).gen(
78+
gen=expand,
79+
params=("idx",),
80+
output={"val": str},
81+
)
82+
83+
values = generated.to_values("val")
84+
85+
assert len(values) == 2 * n
86+
assert set(values) == {f"val-{i}" for i in range(2 * n)}

0 commit comments

Comments
 (0)