iterative
Lines changed: 1 addition & 373 deletions
diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py
--- a/tests/func/test_datachain.py
+++ b/tests/func/test_datachain.py
@@ -16,7 +16,7 @@
 from PIL import Image
 
 import datachain as dc
-from datachain import DataChainError, DataModel, Mapper, func
+from datachain import DataModel, func
 from datachain.data_storage.sqlite import SQLiteWarehouse
 from datachain.dataset import DatasetDependencyType
 from datachain.lib.file import File, ImageFile
@@ -25,7 +25,6 @@
 from datachain.query.dataset import QueryStep
 from tests.utils import (
     ANY_VALUE,
-    LARGE_TREE,
     TARRED_TREE,
     df_equal,
     images_equal,
@@ -734,377 +733,6 @@ def test_parallel(processes, test_session_tmpfile):
     assert res == [prefix + v for v in vals]
 
 
-@pytest.mark.parametrize(
-    "cloud_type,version_aware",
-    [("s3", True)],
-    indirect=True,
-)
-def test_udf(cloud_test_catalog):
-    session = cloud_test_catalog.session
-
-    def name_len(path):
-        return (len(posixpath.basename(path)),)
-
-    chain = (
-        dc.read_storage(cloud_test_catalog.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .filter(dc.C("file.path").glob("cats*") | (dc.C("file.size") < 4))
-        .map(name_len, params=["file.path"], output={"name_len": int})
-    )
-    result1 = chain.select("file.path", "name_len").to_list()
-    # ensure that we're able to run with same query multiple times
-    result2 = chain.select("file.path", "name_len").to_list()
-    count = chain.count()
-    assert len(result1) == 3
-    assert len(result2) == 3
-    assert count == 3
-
-    for r1, r2 in zip(result1, result2, strict=False):
-        # Check that the UDF ran successfully
-        assert len(posixpath.basename(r1[0])) == r1[1]
-        assert len(posixpath.basename(r2[0])) == r2[1]
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware",
-    [("s3", True)],
-    indirect=True,
-)
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_parallel(cloud_test_catalog_tmpfile):
-    session = cloud_test_catalog_tmpfile.session
-
-    def name_len(name):
-        return (len(name),)
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .settings(parallel=True)
-        .map(name_len, params=["file.path"], output={"name_len": int})
-        .select("file.path", "name_len")
-    )
-
-    # Check that the UDF ran successfully
-    count = 0
-    for r in chain:
-        count += 1
-        assert len(r[0]) == r[1]
-    assert count == 7
-
-
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_parallel_boostrap(test_session_tmpfile):
-    vals = ["a", "b", "c", "d", "e", "f"]
-
-    class MyMapper(Mapper):
-        DEFAULT_VALUE = 84
-        BOOTSTRAP_VALUE = 1452
-        TEARDOWN_VALUE = 98763
-
-        def __init__(self):
-            super().__init__()
-            self.value = MyMapper.DEFAULT_VALUE
-            self._had_teardown = False
-
-        def process(self, key) -> int:
-            return self.value
-
-        def setup(self):
-            self.value = MyMapper.BOOTSTRAP_VALUE
-
-        def teardown(self):
-            self.value = MyMapper.TEARDOWN_VALUE
-
-    chain = dc.read_values(key=vals, session=test_session_tmpfile)
-
-    res = chain.settings(parallel=4).map(res=MyMapper()).to_values("res")
-
-    assert res == [MyMapper.BOOTSTRAP_VALUE] * len(vals)
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware,tree",
-    [("s3", True, LARGE_TREE)],
-    indirect=True,
-)
-@pytest.mark.parametrize("workers", (1, 2))
-@pytest.mark.parametrize("parallel", (1, 2))
-@pytest.mark.skipif(
-    "not os.environ.get('DATACHAIN_DISTRIBUTED')",
-    reason="Set the DATACHAIN_DISTRIBUTED environment variable "
-    "to test distributed UDFs",
-)
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_distributed(
-    cloud_test_catalog_tmpfile, workers, parallel, tree, run_datachain_worker
-):
-    session = cloud_test_catalog_tmpfile.session
-
-    def name_len(name):
-        return (len(name),)
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .settings(parallel=parallel, workers=workers)
-        .map(name_len, params=["file.path"], output={"name_len": int})
-        .select("file.path", "name_len")
-    )
-
-    # Check that the UDF ran successfully
-    count = 0
-    for r in chain:
-        count += 1
-        assert len(r[0]) == r[1]
-    assert count == 225
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware",
-    [("s3", True)],
-    indirect=True,
-)
-def test_class_udf(cloud_test_catalog):
-    session = cloud_test_catalog.session
-
-    class MyUDF(Mapper):
-        def __init__(self, constant, multiplier=1):
-            self.constant = constant
-            self.multiplier = multiplier
-
-        def process(self, size):
-            return (self.constant + size * self.multiplier,)
-
-    chain = (
-        dc.read_storage(cloud_test_catalog.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .map(
-            MyUDF(5, multiplier=2),
-            output={"total": int},
-            params=["file.size"],
-        )
-        .select("file.size", "total")
-        .order_by("file.size")
-    )
-
-    assert chain.to_list() == [
-        (3, 11),
-        (4, 13),
-        (4, 13),
-        (4, 13),
-        (4, 13),
-        (4, 13),
-    ]
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware",
-    [("s3", True)],
-    indirect=True,
-)
-@pytest.mark.xdist_group(name="tmpfile")
-def test_class_udf_parallel(cloud_test_catalog_tmpfile):
-    session = cloud_test_catalog_tmpfile.session
-
-    class MyUDF(Mapper):
-        def __init__(self, constant, multiplier=1):
-            self.constant = constant
-            self.multiplier = multiplier
-
-        def process(self, size):
-            return (self.constant + size * self.multiplier,)
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .settings(parallel=2)
-        .map(
-            MyUDF(5, multiplier=2),
-            output={"total": int},
-            params=["file.size"],
-        )
-        .select("file.size", "total")
-        .order_by("file.size")
-    )
-
-    assert chain.to_list() == [
-        (3, 11),
-        (4, 13),
-        (4, 13),
-        (4, 13),
-        (4, 13),
-        (4, 13),
-    ]
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware",
-    [("s3", True)],
-    indirect=True,
-)
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_parallel_exec_error(cloud_test_catalog_tmpfile):
-    session = cloud_test_catalog_tmpfile.session
-
-    def name_len_error(_name):
-        # A udf that raises an exception
-        raise RuntimeError("Test Error!")
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .filter(dc.C("file.path").glob("cats*") | (dc.C("file.size") < 4))
-        .settings(parallel=True)
-        .map(name_len_error, params=["file.path"], output={"name_len": int})
-    )
-
-    if os.environ.get("DATACHAIN_DISTRIBUTED"):
-        # in distributed mode we expect DataChainError with the error message
-        with pytest.raises(DataChainError, match="Test Error!"):
-            chain.show()
-    else:
-        # while in local mode we expect RuntimeError with the error message
-        with pytest.raises(RuntimeError, match="UDF Execution Failed!"):
-            chain.show()
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware,tree",
-    [("s3", True, LARGE_TREE)],
-    indirect=True,
-)
-@pytest.mark.parametrize("workers", (1, 2))
-@pytest.mark.parametrize("parallel", (1, 2))
-@pytest.mark.skipif(
-    "not os.environ.get('DATACHAIN_DISTRIBUTED')",
-    reason="Set the DATACHAIN_DISTRIBUTED environment variable "
-    "to test distributed UDFs",
-)
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_distributed_exec_error(
-    cloud_test_catalog_tmpfile, workers, parallel, tree, run_datachain_worker
-):
-    session = cloud_test_catalog_tmpfile.session
-
-    def name_len_error(_name):
-        # A udf that raises an exception
-        raise RuntimeError("Test Error!")
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .filter(dc.C("file.path").glob("cats*") | (dc.C("file.size") < 4))
-        .settings(parallel=parallel, workers=workers)
-        .map(name_len_error, params=["file.path"], output={"name_len": int})
-    )
-    with pytest.raises(DataChainError, match="Test Error!"):
-        chain.show()
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware",
-    [("s3", True)],
-    indirect=True,
-)
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_reuse_on_error(cloud_test_catalog_tmpfile):
-    session = cloud_test_catalog_tmpfile.session
-
-    error_state = {"error": True}
-
-    def name_len_maybe_error(path):
-        if error_state["error"]:
-            # A udf that raises an exception
-            raise RuntimeError("Test Error!")
-        return (len(path),)
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .filter(dc.C("file.path").glob("cats*") | (dc.C("file.size") < 4))
-        .map(name_len_maybe_error, params=["file.path"], output={"path_len": int})
-        .select("file.path", "path_len")
-    )
-
-    with pytest.raises(DataChainError, match="Test Error!"):
-        chain.show()
-
-    # Simulate fixing the error
-    error_state["error"] = False
-
-    # Retry Query
-    count = 0
-    for r in chain:
-        # Check that the UDF ran successfully
-        count += 1
-        assert len(r[0]) == r[1]
-    assert count == 3
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware",
-    [("s3", True)],
-    indirect=True,
-)
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_parallel_interrupt(cloud_test_catalog_tmpfile, capfd):
-    session = cloud_test_catalog_tmpfile.session
-
-    def name_len_interrupt(_name):
-        # A UDF that emulates cancellation due to a KeyboardInterrupt.
-        raise KeyboardInterrupt
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .filter(dc.C("file.path").glob("cats*") | (dc.C("file.size") < 4))
-        .settings(parallel=True)
-        .map(name_len_interrupt, params=["file.path"], output={"name_len": int})
-    )
-    if os.environ.get("DATACHAIN_DISTRIBUTED"):
-        with pytest.raises(KeyboardInterrupt):
-            chain.show()
-    else:
-        with pytest.raises(RuntimeError, match="UDF Execution Failed!"):
-            chain.show()
-    captured = capfd.readouterr()
-    assert "semaphore" not in captured.err
-
-
-@pytest.mark.parametrize(
-    "cloud_type,version_aware,tree",
-    [("s3", True, LARGE_TREE)],
-    indirect=True,
-)
-@pytest.mark.skipif(
-    "not os.environ.get('DATACHAIN_DISTRIBUTED')",
-    reason="Set the DATACHAIN_DISTRIBUTED environment variable "
-    "to test distributed UDFs",
-)
-@pytest.mark.parametrize("workers", (1, 2))
-@pytest.mark.parametrize("parallel", (1, 2))
-@pytest.mark.xdist_group(name="tmpfile")
-def test_udf_distributed_interrupt(
-    cloud_test_catalog_tmpfile, capfd, tree, workers, parallel, run_datachain_worker
-):
-    session = cloud_test_catalog_tmpfile.session
-
-    def name_len_interrupt(_name):
-        # A UDF that emulates cancellation due to a KeyboardInterrupt.
-        raise KeyboardInterrupt
-
-    chain = (
-        dc.read_storage(cloud_test_catalog_tmpfile.src_uri, session=session)
-        .filter(dc.C("file.size") < 13)
-        .filter(dc.C("file.path").glob("cats*") | (dc.C("file.size") < 4))
-        .settings(parallel=parallel, workers=workers)
-        .map(name_len_interrupt, params=["file.path"], output={"name_len": int})
-    )
-    with pytest.raises(KeyboardInterrupt):
-        chain.show()
-    captured = capfd.readouterr()
-    assert "semaphore" not in captured.err
-
-
 @pytest.mark.parametrize(
     "cloud_type,version_aware",
     [("s3", True)],