Merged
Changes from 18 commits
24 changes: 12 additions & 12 deletions benchmarks/db-benchmark/groupby-datafusion.py
@@ -20,7 +20,7 @@
 import timeit

 import datafusion as df
-import pyarrow
+import pyarrow as pa
 from datafusion import (
     RuntimeEnvBuilder,
     SessionConfig,
@@ -37,7 +37,7 @@
 exec(open("./_helpers/helpers.py").read())


-def ans_shape(batches):
+def ans_shape(batches) -> tuple[int, int]:
     rows, cols = 0, 0
     for batch in batches:
         rows += batch.num_rows
@@ -48,7 +48,7 @@ def ans_shape(batches):
     return rows, cols


-def execute(df):
+def execute(df) -> list:
     print(df.execution_plan().display_indent())
     return df.collect()

@@ -68,14 +68,14 @@ def execute(df):
 src_grp = os.path.join("data", data_name + ".csv")
 print("loading dataset %s" % src_grp, flush=True)

-schema = pyarrow.schema(
+schema = pa.schema(
     [
-        ("id4", pyarrow.int32()),
-        ("id5", pyarrow.int32()),
-        ("id6", pyarrow.int32()),
-        ("v1", pyarrow.int32()),
-        ("v2", pyarrow.int32()),
-        ("v3", pyarrow.float64()),
+        ("id4", pa.int32()),
+        ("id5", pa.int32()),
+        ("id6", pa.int32()),
+        ("v1", pa.int32()),
+        ("v2", pa.int32()),
+        ("v3", pa.float64()),
     ]
 )

@@ -93,8 +93,8 @@ def execute(df):
 )
 config = (
     SessionConfig()
-    .with_repartition_joins(False)
-    .with_repartition_aggregations(False)
+    .with_repartition_joins(enabled=False)
+    .with_repartition_aggregations(enabled=False)
     .set("datafusion.execution.coalesce_batches", "false")
 )
 ctx = SessionContext(config, runtime)
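
Taken together, these hunks are mechanical lint-style fixes: pyarrow gets its conventional pa alias, the helpers get return annotations, and boolean config toggles are passed by keyword so the call site names the flag being set. A minimal, self-contained sketch of the pattern the file converges on (the runtime builder chain is borrowed from examples/create-context.py below; the pool size there is illustrative):

import pyarrow as pa
from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext

# schema built with the short alias, as in the hunk above
schema = pa.schema([("id4", pa.int32()), ("v3", pa.float64())])

runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
config = (
    SessionConfig()
    .with_repartition_joins(enabled=False)  # keyword, not a bare positional False
    .with_repartition_aggregations(enabled=False)
    .set("datafusion.execution.coalesce_batches", "false")
)
ctx = SessionContext(config, runtime)
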
5 changes: 3 additions & 2 deletions benchmarks/db-benchmark/join-datafusion.py
@@ -29,7 +29,7 @@
 exec(open("./_helpers/helpers.py").read())


-def ans_shape(batches):
+def ans_shape(batches) -> tuple[int, int]:
     rows, cols = 0, 0
     for batch in batches:
         rows += batch.num_rows
@@ -57,7 +57,8 @@ def ans_shape(batches):
     os.path.join("data", y_data_name[2] + ".csv"),
 ]
 if len(src_jn_y) != 3:
-    raise Exception("Something went wrong in preparing files used for join")
+    error_msg = "Something went wrong in preparing files used for join"
+    raise Exception(error_msg)

 print(
     "loading datasets "
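
Hoisting the message into error_msg before raising looks like ruff's EM101 fix (no string literal directly inside an exception constructor), which keeps the raise line in tracebacks short. The same pattern in isolation, with hypothetical names:

def require_three_files(files: list) -> None:
    # bind the message to a variable first, then raise it
    if len(files) != 3:
        error_msg = f"expected 3 join files, got {len(files)}"
        raise ValueError(error_msg)
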
7 changes: 2 additions & 5 deletions benchmarks/tpch/tpch.py
@@ -21,7 +21,7 @@
 from datafusion import SessionContext


-def bench(data_path, query_path):
+def bench(data_path, query_path) -> None:
     with open("results.csv", "w") as results:
         # register tables
         start = time.time()
@@ -68,10 +68,7 @@ def bench(data_path, query_path):
         with open(f"{query_path}/q{query}.sql") as f:
             text = f.read()
         tmp = text.split(";")
-        queries = []
-        for str in tmp:
-            if len(str.strip()) > 0:
-                queries.append(str.strip())
+        queries = [s.strip() for s in tmp if len(s.strip()) > 0]

         try:
             start = time.time()
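
Besides being shorter, the comprehension stops shadowing the builtin str, which the old loop variable did. Its behavior is easy to verify on a toy input (a standalone check, not part of the benchmark):

text = "SELECT 1;\n\nSELECT 2;\n"
tmp = text.split(";")
queries = [s.strip() for s in tmp if len(s.strip()) > 0]
# the empty fragment after the final ";" is stripped to "" and dropped
assert queries == ["SELECT 1", "SELECT 2"]
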
6 changes: 3 additions & 3 deletions dev/release/generate-changelog.py
@@ -24,7 +24,7 @@
 from github import Github


-def print_pulls(repo_name, title, pulls):
+def print_pulls(repo_name, title, pulls) -> None:
     if len(pulls) > 0:
         print(f"**{title}:**")
         print()
@@ -34,7 +34,7 @@ def print_pulls(repo_name, title, pulls):
     print()


-def generate_changelog(repo, repo_name, tag1, tag2, version):
+def generate_changelog(repo, repo_name, tag1, tag2, version) -> None:
     # get a list of commits between two tags
     print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr)
     comparison = repo.compare(tag1, tag2)
@@ -154,7 +154,7 @@ def generate_changelog(repo, repo_name, tag1, tag2, version):
 )


-def cli(args=None):
+def cli(args=None) -> None:
     """Process command line arguments."""
     if not args:
         args = sys.argv[1:]
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -73,7 +73,7 @@
 autoapi_python_class_content = "both"


-def autoapi_skip_member_fn(app, what, name, obj, skip, options):  # noqa: ARG001
+def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool:  # noqa: ARG001
     skip_contents = [
         # Re-exports
         ("class", "datafusion.DataFrame"),
@@ -93,7 +93,7 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options):  # noqa: ARG001
     return skip


-def setup(sphinx):
+def setup(sphinx) -> None:
     sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn)
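
For readers unfamiliar with the hook: Sphinx AutoAPI fires autoapi-skip-member once per documented object, and returning True suppresses that entry. A stripped-down sketch of the mechanism, reusing one tuple from the skip list above (the membership test is an assumption, since the diff elides the function body):

def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool:  # noqa: ARG001
    skip_contents = [
        ("class", "datafusion.DataFrame"),  # re-export, documented at its source
    ]
    if (what, name) in skip_contents:
        return True  # True tells AutoAPI to omit this object
    return skip


def setup(sphinx) -> None:
    sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn)
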
12 changes: 6 additions & 6 deletions examples/create-context.py
@@ -25,14 +25,14 @@
 runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
 config = (
     SessionConfig()
-    .with_create_default_catalog_and_schema(True)
+    .with_create_default_catalog_and_schema(enabled=True)
     .with_default_catalog_and_schema("foo", "bar")
     .with_target_partitions(8)
-    .with_information_schema(True)
-    .with_repartition_joins(False)
-    .with_repartition_aggregations(False)
-    .with_repartition_windows(False)
-    .with_parquet_pruning(False)
+    .with_information_schema(enabled=True)
+    .with_repartition_joins(enabled=False)
+    .with_repartition_aggregations(enabled=False)
+    .with_repartition_windows(enabled=False)
+    .with_parquet_pruning(enabled=False)
     .set("datafusion.execution.parquet.pushdown_filters", "true")
 )
 ctx = SessionContext(config, runtime)
36 changes: 16 additions & 20 deletions examples/python-udaf.py
@@ -16,7 +16,7 @@
 # under the License.

 import datafusion
-import pyarrow
+import pyarrow as pa
 import pyarrow.compute
 from datafusion import Accumulator, col, udaf

@@ -26,48 +26,44 @@ class MyAccumulator(Accumulator):
     Interface of a user-defined accumulation.
     """

-    def __init__(self):
-        self._sum = pyarrow.scalar(0.0)
+    def __init__(self) -> None:
+        self._sum = pa.scalar(0.0)

-    def update(self, values: pyarrow.Array) -> None:
+    def update(self, values: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pyarrow.scalar(
-            self._sum.as_py() + pyarrow.compute.sum(values).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py())

-    def merge(self, states: pyarrow.Array) -> None:
+    def merge(self, states: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pyarrow.scalar(
-            self._sum.as_py() + pyarrow.compute.sum(states).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py())

-    def state(self) -> pyarrow.Array:
-        return pyarrow.array([self._sum.as_py()])
+    def state(self) -> pa.Array:
+        return pa.array([self._sum.as_py()])

-    def evaluate(self) -> pyarrow.Scalar:
+    def evaluate(self) -> pa.Scalar:
         return self._sum


 # create a context
 ctx = datafusion.SessionContext()

 # create a RecordBatch and a new DataFrame from it
-batch = pyarrow.RecordBatch.from_arrays(
-    [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
+batch = pa.RecordBatch.from_arrays(
+    [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
     names=["a", "b"],
 )
 df = ctx.create_dataframe([[batch]])

 my_udaf = udaf(
     MyAccumulator,
-    pyarrow.float64(),
-    pyarrow.float64(),
-    [pyarrow.float64()],
+    pa.float64(),
+    pa.float64(),
+    [pa.float64()],
     "stable",
 )

 df = df.aggregate([], [my_udaf(col("a"))])

 result = df.collect()[0]

-assert result.column(0) == pyarrow.array([6.0])
+assert result.column(0) == pa.array([6.0])
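
Because the accumulator is plain Python, the contract is easy to exercise outside a query: update folds in raw input values, merge folds in another partition's state. A quick check, assuming the class as defined above:

acc = MyAccumulator()
acc.update(pa.array([1.0, 2.0, 3.0]))  # raw values: running sum becomes 6.0
acc.merge(pa.array([6.0]))             # another partition's state: sum becomes 12.0
assert acc.state() == pa.array([12.0])
assert acc.evaluate().as_py() == 12.0
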
6 changes: 3 additions & 3 deletions examples/python-udf-comparisons.py
@@ -163,9 +163,9 @@ def udf_using_pyarrow_compute_impl(
         resultant_arr = pc.and_(resultant_arr, filtered_returnflag_arr)

-        if results is None:
-            results = resultant_arr
-        else:
-            results = pc.or_(results, resultant_arr)
+        results = (
+            resultant_arr if results is None else pc.or_(results, resultant_arr)
+        )
Review comment (Member): Can you double check this? It looks like it changed the logic slightly. If results is not None it doesn't look like this will work as before
     return results
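
Regarding the review question: a side-by-side check on toy boolean arrays suggests the ternary matches the old if/else in both the None and not-None cases (pc is pyarrow.compute, as in the example):

import pyarrow as pa
import pyarrow.compute as pc

resultant_arr = pa.array([True, False, True])
for results in (None, pa.array([False, False, True])):
    # original branching form
    if results is None:
        branchy = resultant_arr
    else:
        branchy = pc.or_(results, resultant_arr)
    # refactored ternary form
    ternary = resultant_arr if results is None else pc.or_(results, resultant_arr)
    assert branchy == ternary
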
12 changes: 6 additions & 6 deletions examples/python-udf.py
@@ -15,23 +15,23 @@
 # specific language governing permissions and limitations
 # under the License.

-import pyarrow
+import pyarrow as pa
 from datafusion import SessionContext, udf
 from datafusion import functions as f


-def is_null(array: pyarrow.Array) -> pyarrow.Array:
+def is_null(array: pa.Array) -> pa.Array:
     return array.is_null()


-is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), "stable")
+is_null_arr = udf(is_null, [pa.int64()], pa.bool_(), "stable")

 # create a context
 ctx = SessionContext()

 # create a RecordBatch and a new DataFrame from it
-batch = pyarrow.RecordBatch.from_arrays(
-    [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
+batch = pa.RecordBatch.from_arrays(
+    [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
     names=["a", "b"],
 )
 df = ctx.create_dataframe([[batch]])
@@ -40,4 +40,4 @@ def is_null(array: pyarrow.Array) -> pyarrow.Array:

 result = df.collect()[0]

-assert result.column(0) == pyarrow.array([False] * 3)
+assert result.column(0) == pa.array([False] * 3)
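
The UDF body is pure pyarrow, so it can be sanity-checked directly, without a SessionContext (a standalone check, not part of the example file):

assert is_null(pa.array([1, None, 3])) == pa.array([False, True, False])
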
10 changes: 5 additions & 5 deletions examples/query-pyarrow-data.py
@@ -16,15 +16,15 @@
 # under the License.

 import datafusion
-import pyarrow
+import pyarrow as pa
 from datafusion import col

 # create a context
 ctx = datafusion.SessionContext()

 # create a RecordBatch and a new DataFrame from it
-batch = pyarrow.RecordBatch.from_arrays(
-    [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
+batch = pa.RecordBatch.from_arrays(
+    [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
     names=["a", "b"],
 )
 df = ctx.create_dataframe([[batch]])
@@ -38,5 +38,5 @@
 # execute and collect the first (and only) batch
 result = df.collect()[0]

-assert result.column(0) == pyarrow.array([5, 7, 9])
-assert result.column(1) == pyarrow.array([-3, -3, -3])
+assert result.column(0) == pa.array([5, 7, 9])
+assert result.column(1) == pa.array([-3, -3, -3])
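
The elided middle of this example is the projection itself; consistent with the asserts just shown, it presumably computes a + b and a - b. A hypothetical reconstruction, not the file's actual line:

df = df.select(col("a") + col("b"), col("a") - col("b"))
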
2 changes: 1 addition & 1 deletion examples/sql-using-python-udaf.py
@@ -25,7 +25,7 @@ class MyAccumulator(Accumulator):
     Interface of a user-defined accumulation.
     """

-    def __init__(self):
+    def __init__(self) -> None:
         self._sum = pa.scalar(0.0)

     def update(self, values: pa.Array) -> None: