Skip to content

Commit 3ca690c

Browse files
committed
convert transformation functions to use yield instead of return
1 parent fdc3079 commit 3ca690c

File tree

3 files changed

+64
-64
lines changed

3 files changed

+64
-64
lines changed

dlt/transformations/transformation.py

Lines changed: 44 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from typing import Callable, Any, Optional, Type, Iterator, List
44

55
import dlt
6+
import sqlglot
67

78
from dlt.common.configuration.inject import get_fun_last_config, get_fun_spec
8-
from dlt.common.reflection.inspect import isgeneratorfunction
99
from dlt.common.typing import TDataItems, TTableHintTemplate
1010
from dlt.common import logger
1111

@@ -16,7 +16,6 @@
1616
from dlt.transformations.typing import TTransformationFunParams
1717
from dlt.transformations.exceptions import (
1818
TransformationException,
19-
TransformationInvalidReturnTypeException,
2019
IncompatibleDatasetsException,
2120
)
2221
from dlt.pipeline.exceptions import PipelineConfigMissing
@@ -32,7 +31,6 @@
3231
from dlt.transformations.configuration import TransformationConfiguration
3332
from dlt.common.utils import get_callable_name
3433
from dlt.extract.exceptions import CurrentSourceNotAvailable
35-
from dlt.common.schema.typing import TPartialTableSchema
3634
from dlt.extract.pipe_iterator import DataItemWithMeta
3735

3836

@@ -82,7 +80,6 @@ def make_transformation_resource(
8280
section: Optional[TTableHintTemplate[str]],
8381
) -> DltTransformationResource:
8482
resource_name = name if name and not callable(name) else get_callable_name(func)
85-
is_regular_resource = isgeneratorfunction(func)
8683

8784
if spec and not issubclass(spec, TransformationConfiguration):
8885
raise TransformationException(
@@ -92,16 +89,53 @@ def make_transformation_resource(
9289

9390
@wraps(func)
9491
def transformation_function(*args: Any, **kwargs: Any) -> Iterator[TDataItems]:
95-
config: TransformationConfiguration = (
96-
get_fun_last_config(func) or get_fun_spec(func)() # type: ignore[assignment]
97-
)
98-
9992
# Collect all datasets from args and kwargs
10093
all_arg_values = list(args) + list(kwargs.values())
10194
datasets: List[ReadableDBAPIDataset] = [
10295
arg for arg in all_arg_values if isinstance(arg, ReadableDBAPIDataset)
10396
]
10497

98+
# get first item from gen and see what we're dealing with
99+
gen = func(*args, **kwargs)
100+
original_first_item = next(gen)
101+
102+
# unwrap if needed
103+
meta = None
104+
unwrapped_item = original_first_item
105+
relation = None
106+
if isinstance(original_first_item, DataItemWithMeta):
107+
meta = original_first_item.meta
108+
unwrapped_item = original_first_item.data
109+
110+
# catch the two cases where we get a relation from the transformation function
111+
# NOTE: we only process the first item, all other things that are still in the generator are ignored
112+
if isinstance(unwrapped_item, BaseReadableDBAPIRelation):
113+
relation = unwrapped_item
114+
# we see if the string is a valid sql query, if so we need a dataset
115+
elif isinstance(unwrapped_item, str):
116+
try:
117+
sqlglot.parse_one(unwrapped_item)
118+
if len(datasets) == 0:
119+
raise IncompatibleDatasetsException(
120+
resource_name,
121+
"No datasets found in transformation function arguments. Please supply all"
122+
" used datasets via transform function arguments.",
123+
)
124+
else:
125+
relation = datasets[0](unwrapped_item)
126+
except sqlglot.errors.ParseError:
127+
pass
128+
129+
# we have something else, so fall back to regular resource behavior
130+
if not relation:
131+
yield original_first_item
132+
yield from gen
133+
return
134+
135+
config: TransformationConfiguration = (
136+
get_fun_last_config(func) or get_fun_spec(func)() # type: ignore[assignment]
137+
)
138+
105139
# Warn if Incremental arguments are present
106140
for arg_name, param in inspect.signature(func).parameters.items():
107141
if param.annotation is Incremental or isinstance(param.default, Incremental):
@@ -138,31 +172,8 @@ def transformation_function(*args: Any, **kwargs: Any) -> Iterator[TDataItems]:
138172
# respect always materialize config
139173
should_materialize = should_materialize or config.always_materialize
140174

141-
# Call the transformation function
142-
transformation_result: Any = func(*args, **kwargs)
143-
144-
# unwrap meta
145-
meta = None
146-
if isinstance(transformation_result, DataItemWithMeta):
147-
meta = transformation_result.meta
148-
transformation_result = transformation_result.data
149-
150-
# If a string is returned, construct relation from first dataset from it
151-
if isinstance(transformation_result, BaseReadableDBAPIRelation):
152-
relation = transformation_result
153-
elif isinstance(transformation_result, str):
154-
relation = datasets[0](transformation_result)
155-
else:
156-
raise TransformationInvalidReturnTypeException(
157-
resource_name,
158-
"Sql Transformation %s returned an invalid type: %s. Please either return a valid"
159-
" sql string or Ibis / data frame expression from a dataset. If you want to return"
160-
" data (data frames / arrow table), please yield those, not return."
161-
% (name, type(transformation_result)),
162-
)
163-
175+
# build model if needed
164176
sql_model = MaterializableSqlModel.from_relation(relation)
165-
166177
if not should_materialize:
167178
if meta:
168179
yield DataItemWithMeta(meta, sql_model)
@@ -188,6 +199,4 @@ def transformation_function(*args: Any, **kwargs: Any) -> Iterator[TDataItems]:
188199
section=section,
189200
_impl_cls=DltTransformationResource,
190201
_base_spec=TransformationConfiguration,
191-
)(
192-
func if is_regular_resource else transformation_function # type: ignore[arg-type]
193-
)
202+
)(transformation_function)

tests/transformations/test_transformation_decorator.py

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,23 @@
2222

2323

2424
def test_no_datasets_used() -> None:
25+
# valid sql string without dataset will raise
2526
with pytest.raises(IncompatibleDatasetsException) as excinfo:
2627

2728
@dlt.transformation()
2829
def transform() -> Any:
29-
return {"some": "data"}
30+
yield "SELECT * FROM table1"
3031

3132
list(transform())
3233

33-
assert "No datasets detected in transformation. Please supply all used datasets via" in str(
34-
excinfo.value
35-
)
34+
assert "No datasets found in transformation function arguments" in str(excinfo.value)
35+
36+
# invalid sql string without dataset will be interpreted as string item
37+
@dlt.transformation()
38+
def other_transform() -> Any:
39+
yield "Hello I am a string"
40+
41+
assert list(other_transform()) == ["Hello I am a string"]
3642

3743

3844
def test_iterator_function_as_transform_function() -> None:
@@ -44,19 +50,6 @@ def transform(dataset: SupportsReadableDataset[Any]) -> Any:
4450
assert list(transform(dlt.dataset("duckdb", "dataset_name"))) == [{"some": "data"}]
4551

4652

47-
def test_incorrect_transform_function_return_type() -> None:
48-
p = dlt.pipeline("test_pipeline", destination="duckdb")
49-
50-
@dlt.transformation()
51-
def transform(dataset: SupportsReadableDataset[Any]) -> Any:
52-
return {"some": "data"}
53-
54-
with pytest.raises(PipelineStepFailed) as excinfo:
55-
p.run(transform(dlt.dataset(dlt.destinations.duckdb("input_data"), "dataset_name")))
56-
57-
assert "Please either return a valid sql string or" in str(excinfo.value)
58-
59-
6053
def test_incremental_argument_is_not_supported(caplog: LogCaptureFixture) -> None:
6154
# test incremental default arg
6255
with patch.object(logger, "warning") as mock_warning:
@@ -68,7 +61,7 @@ def transform_1(
6861
dataset: SupportsReadableDataset[Any],
6962
incremental_arg=dlt.sources.incremental("col1"),
7063
) -> Any:
71-
return "SELECT col1 FROM table1"
64+
yield "SELECT col1 FROM table1"
7265

7366
list(transform_1(dlt.dataset("duckdb", "dataset_name")))
7467

@@ -127,7 +120,7 @@ def default_spec(dataset: dlt.Dataset):
127120
assert type(config) is not TransformationConfiguration
128121
# config got passed
129122
assert config.buffer_max_items == 100
130-
return "SELECT col1 FROM table1"
123+
yield "SELECT col1 FROM table1"
131124

132125
schema = Schema("_data")
133126
schema.update_table(new_table("table1", columns=[{"name": "col1", "data_type": "text"}]))
@@ -146,7 +139,7 @@ def default_transformation_with_args(
146139
dataset: dlt.Dataset, last_id: str = dlt.config.value, limit: int = 5
147140
):
148141
assert last_id == "test_last_id"
149-
return dataset.table1[["col1"]]
142+
yield dataset.table1[["col1"]]
150143

151144
spec = get_fun_spec(default_transformation_with_args)
152145
assert "last_id" in spec().get_resolvable_fields()
@@ -178,7 +171,7 @@ def default_transformation_spec(
178171
assert limit == 100
179172

180173
table1_ = dataset(f"SELECT * FROM table1 WHERE col1 = '{last_idx}' LIMIT {limit}")
181-
return table1_
174+
yield table1_
182175

183176
assert default_transformation_spec.name == "default_name_ovr"
184177
assert default_transformation_spec.section == "default_name_ovr"
@@ -191,8 +184,6 @@ def default_transformation_spec(
191184
assert isinstance(model, SqlModel)
192185
query = model.query
193186
# make sure we have our args in query
194-
print(model)
195-
print(query)
196187
assert "uniq_last_id" in query
197188
assert "100" in query
198189

tests/transformations/test_transformations.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,13 @@ def test_simple_query_transformations(
4242

4343
@dlt.transformation()
4444
def copied_purchases(dataset: SupportsReadableDataset[Any]) -> Any:
45-
return """SELECT * FROM purchases LIMIT 3"""
45+
yield """SELECT * FROM purchases LIMIT 3"""
4646

4747
elif transformation_type == "relation":
4848

4949
@dlt.transformation()
5050
def copied_purchases(dataset: SupportsReadableDataset[Any]) -> Any:
51-
return dataset["purchases"].limit(3)
51+
yield dataset["purchases"].limit(3)
5252

5353
# transform into transformed dataset
5454
os.environ["ALWAYS_MATERIALIZE"] = str(always_materialize)
@@ -87,12 +87,12 @@ def test_transformations_with_supplied_hints(
8787
# we can now transform this table twice, one with changed hints and once with the original hints
8888
@dlt.transformation()
8989
def inventory_original(dataset: SupportsReadableDataset[Any]) -> Any:
90-
return dataset["inventory"]
90+
yield dataset["inventory"]
9191

9292
@dlt.transformation()
9393
def inventory_more_precise(dataset: SupportsReadableDataset[Any]) -> Any:
9494
hints = make_hints(columns=[{"name": "price", "precision": 20, "scale": 2}])
95-
return dlt.mark.with_hints(dataset["inventory"], hints=hints)
95+
yield dlt.mark.with_hints(dataset["inventory"], hints=hints)
9696

9797
dest_p.run([inventory_original(fruit_p.dataset()), inventory_more_precise(fruit_p.dataset())])
9898

@@ -119,7 +119,7 @@ def test_extract_without_source_name_or_pipeline(
119119

120120
@dlt.transformation()
121121
def buffer_size_test(dataset: SupportsReadableDataset[Any]) -> Any:
122-
return dataset["customers"]
122+
yield dataset["customers"]
123123

124124
# transformations switch to model extraction
125125
fruit_p.deactivate()
@@ -139,7 +139,7 @@ def test_extract_without_destination(destination_config: DestinationTestConfigur
139139

140140
@dlt.transformation()
141141
def extract_test(dataset: SupportsReadableDataset[Any]) -> Any:
142-
return dataset["customers"]
142+
yield dataset["customers"]
143143

144144
pipeline_no_destination = dlt.pipeline(pipeline_name="no_destination")
145145
pipeline_no_destination._destination = None

0 commit comments

Comments
 (0)