apache · timsaucer · Mar 12, 2025 · Mar 9, 2025 · Mar 9, 2025 · Mar 9, 2025
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -33,9 +33,11 @@ jobs:
       fail-fast: false
       matrix:
         python-version:
+          - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
+          - "3.13"
         toolchain:
           - "stable"
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
           - id: actionlint-docker
       - repo: https://github.com/astral-sh/ruff-pre-commit
         # Ruff version.
-        rev: v0.3.0
+        rev: v0.9.10
         hooks:
           # Run the linter.
           - id: ruff

diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py
@@ -59,13 +59,13 @@ def bench(data_path, query_path):
         end = time.time()
         time_millis = (end - start) * 1000
         total_time_millis += time_millis
-        print("setup,{}".format(round(time_millis, 1)))
-        results.write("setup,{}\n".format(round(time_millis, 1)))
+        print(f"setup,{round(time_millis, 1)}")
+        results.write(f"setup,{round(time_millis, 1)}\n")
         results.flush()
 
         # run queries
         for query in range(1, 23):
-            with open("{}/q{}.sql".format(query_path, query)) as f:
+            with open(f"{query_path}/q{query}.sql") as f:
                 text = f.read()
                 tmp = text.split(";")
                 queries = []
@@ -83,14 +83,14 @@ def bench(data_path, query_path):
                     end = time.time()
                     time_millis = (end - start) * 1000
                     total_time_millis += time_millis
-                    print("q{},{}".format(query, round(time_millis, 1)))
-                    results.write("q{},{}\n".format(query, round(time_millis, 1)))
+                    print(f"q{query},{round(time_millis, 1)}")
+                    results.write(f"q{query},{round(time_millis, 1)}\n")
                     results.flush()
                 except Exception as e:
                     print("query", query, "failed", e)
 
-        print("total,{}".format(round(total_time_millis, 1)))
-        results.write("total,{}\n".format(round(total_time_millis, 1)))
+        print(f"total,{round(total_time_millis, 1)}")
+        results.write(f"total,{round(total_time_millis, 1)}\n")
 
 
 if __name__ == "__main__":

diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py
@@ -29,7 +29,7 @@
 exclude_globs_filename = sys.argv[1]
 xml_filename = sys.argv[2]
 
-globs = [line.strip() for line in open(exclude_globs_filename, "r")]
+globs = [line.strip() for line in open(exclude_globs_filename)]
 
 tree = ET.parse(xml_filename)
 root = tree.getroot()

diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py
@@ -26,15 +26,11 @@
 
 def print_pulls(repo_name, title, pulls):
     if len(pulls) > 0:
-        print("**{}:**".format(title))
+        print(f"**{title}:**")
         print()
         for pull, commit in pulls:
-            url = "https://github.com/{}/pull/{}".format(repo_name, pull.number)
-            print(
-                "- {} [#{}]({}) ({})".format(
-                    pull.title, pull.number, url, commit.author.login
-                )
-            )
+            url = f"https://github.com/{repo_name}/pull/{pull.number}"
+            print(f"- {pull.title} [#{pull.number}]({url}) ({commit.author.login})")
         print()
 
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -73,7 +73,7 @@
 autoapi_python_class_content = "both"
 
 
-def autoapi_skip_member_fn(app, what, name, obj, skip, options):
+def autoapi_skip_member_fn(app, what, name, obj, skip, options):  # noqa: ARG001
     skip_contents = [
         # Re-exports
         ("class", "datafusion.DataFrame"),

diff --git a/examples/python-udwf.py b/examples/python-udwf.py
@@ -59,7 +59,7 @@ def __init__(self, alpha: float) -> None:
     def supports_bounded_execution(self) -> bool:
         return True
 
-    def get_range(self, idx: int, num_rows: int) -> tuple[int, int]:
+    def get_range(self, idx: int, num_rows: int) -> tuple[int, int]:  # noqa: ARG002
         # Override the default range of current row since uses_window_frame is False
         # So for the purpose of this test we just smooth from the previous row to
         # current.

diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py
@@ -27,28 +27,25 @@
 def df_selection(col_name, col_type):
     if col_type == pa.float64() or isinstance(col_type, pa.Decimal128Type):
         return F.round(col(col_name), lit(2)).alias(col_name)
-    elif col_type == pa.string() or col_type == pa.string_view():
+    if col_type == pa.string() or col_type == pa.string_view():
         return F.trim(col(col_name)).alias(col_name)
-    else:
-        return col(col_name)
+    return col(col_name)
 
 
 def load_schema(col_name, col_type):
     if col_type == pa.int64() or col_type == pa.int32():
         return col_name, pa.string()
-    elif isinstance(col_type, pa.Decimal128Type):
+    if isinstance(col_type, pa.Decimal128Type):
         return col_name, pa.float64()
-    else:
-        return col_name, col_type
+    return col_name, col_type
 
 
 def expected_selection(col_name, col_type):
     if col_type == pa.int64() or col_type == pa.int32():
         return F.trim(col(col_name)).cast(col_type).alias(col_name)
-    elif col_type == pa.string() or col_type == pa.string_view():
+    if col_type == pa.string() or col_type == pa.string_view():
         return F.trim(col(col_name)).alias(col_name)
-    else:
-        return col(col_name)
+    return col(col_name)
 
 
 def selections_and_schema(original_schema):

diff --git a/pyproject.toml b/pyproject.toml
@@ -24,7 +24,7 @@ name = "datafusion"
 description = "Build and run queries against data"
 readme = "README.md"
 license = { file = "LICENSE.txt" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 keywords = ["datafusion", "dataframe", "rust", "query-engine"]
 classifiers = [
     "Development Status :: 2 - Pre-Alpha",
@@ -35,7 +35,6 @@ classifiers = [
     "Operating System :: Microsoft :: Windows",
     "Operating System :: POSIX :: Linux",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -66,7 +65,57 @@ features = ["substrait"]
 
 # Enable docstring linting using the google style guide
 [tool.ruff.lint]
-select = ["E4", "E7", "E9", "F", "FA", "D", "W", "I"]
+select = ["ALL" ]
+ignore = [
+    "A001",    # Allow using words like min as variable names
+    "A002",    # Allow using words like filter as variable names
+    "ANN401",  # Allow Any for wrapper classes
+    "COM812",  # Recommended to ignore these rules when using with ruff-format
+    "FIX002",  # Allow TODO lines - consider removing at some point
+    "FBT001",  # Allow boolean positional args
+    "FBT002",  # Allow boolean positional args
+    "ISC001",  # Recommended to ignore these rules when using with ruff-format
+    "SLF001",  # Allow accessing private members
+    "TD002",
+    "TD003",   # Allow TODO lines
+    "UP007",   # Disallowing Union is pedantic
+    # TODO: Enable all of the following, but this PR is getting too large already
+    "PT001",
+    "ANN204",
+    "B008",
+    "EM101",
+    "PLR0913",
+    "PLR1714",
+    "ANN201",
+    "C400",
+    "TRY003",
+    "B904",
+    "UP006",
+    "RUF012",
+    "FBT003",
+    "C416",
+    "SIM102",
+    "PGH003",
+    "PLR2004",
+    "PERF401",
+    "PD901",
+    "EM102",
+    "ERA001",
+    "SIM108",
+    "ICN001",
+    "ANN001",
+    "ANN202",
+    "PTH",
+    "N812",
+    "INP001",
+    "DTZ007",
+    "PLW2901",
+    "RET503",
+    "RUF015",
+    "A005",
+    "TC001",
+    "UP035",
+]
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"
@@ -76,16 +125,30 @@ max-doc-length = 88
 
 # Disable docstring checking for these directories
 [tool.ruff.lint.per-file-ignores]
-"python/tests/*" = ["D"]
-"examples/*" = ["D", "W505"]
-"dev/*" = ["D"]
-"benchmarks/*" = ["D", "F"]
+"python/tests/*" = [
+    "ANN",
+    "ARG",
+    "BLE001",
+    "D",
+    "S101",
+    "SLF",
+    "PD",
+    "PLR2004",
+    "PT011",
+    "RUF015",
+    "S608",
+    "PLR0913",
+    "PT004",
+]
+"examples/*" = ["D", "W505", "E501", "T201", "S101"]
+"dev/*" = ["D", "E", "T", "S", "PLR", "C", "SIM", "UP", "EXE", "N817"]
+"benchmarks/*" = ["D", "F", "T", "BLE", "FURB", "PLR", "E", "TD", "TRY", "S", "SIM", "EXE", "UP"]
 "docs/*" = ["D"]
 
 [dependency-groups]
 dev = [
     "maturin>=1.8.1",
-    "numpy>1.24.4 ; python_full_version >= '3.10'",
+    "numpy>1.25.0",
     "pytest>=7.4.4",
     "ruff>=0.9.1",
     "toml>=0.10.2",

diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py
@@ -48,44 +48,47 @@
 from .io import read_avro, read_csv, read_json, read_parquet
 from .plan import ExecutionPlan, LogicalPlan
 from .record_batch import RecordBatch, RecordBatchStream
-from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF
+from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF, udaf, udf, udwf
 
 __version__ = importlib_metadata.version(__name__)
 
 __all__ = [
     "Accumulator",
+    "AggregateUDF",
+    "Catalog",
     "Config",
-    "DataFrame",
-    "SessionContext",
-    "SessionConfig",
-    "SQLOptions",
-    "RuntimeEnvBuilder",
-    "Expr",
-    "ScalarUDF",
-    "WindowFrame",
-    "column",
-    "col",
-    "literal",
-    "lit",
     "DFSchema",
-    "Catalog",
+    "DataFrame",
     "Database",
-    "Table",
-    "AggregateUDF",
-    "WindowUDF",
-    "LogicalPlan",
     "ExecutionPlan",
+    "Expr",
+    "LogicalPlan",
     "RecordBatch",
     "RecordBatchStream",
+    "RuntimeEnvBuilder",
+    "SQLOptions",
+    "ScalarUDF",
+    "SessionConfig",
+    "SessionContext",
+    "Table",
+    "WindowFrame",
+    "WindowUDF",
+    "col",
+    "column",
     "common",
     "expr",
     "functions",
+    "lit",
+    "literal",
     "object_store",
-    "substrait",
-    "read_parquet",
     "read_avro",
     "read_csv",
     "read_json",
+    "read_parquet",
+    "substrait",
+    "udaf",
+    "udf",
+    "udwf",
 ]
 
 
@@ -120,10 +123,3 @@ def str_lit(value):
 def lit(value):
     """Create a literal expression."""
     return Expr.literal(value)
-
-
-udf = ScalarUDF.udf
-
-udaf = AggregateUDF.udaf
-
-udwf = WindowUDF.udwf
diff --git a/python/datafusion/common.py b/python/datafusion/common.py
@@ -20,7 +20,7 @@
 
 from ._internal import common as common_internal
 
-# TODO these should all have proper wrapper classes
+# TODO: these should all have proper wrapper classes
 
 DFSchema = common_internal.DFSchema
 DataType = common_internal.DataType
@@ -38,15 +38,15 @@
     "DFSchema",
     "DataType",
     "DataTypeMap",
-    "RexType",
-    "PythonType",
-    "SqlType",
     "NullTreatment",
-    "SqlTable",
+    "PythonType",
+    "RexType",
+    "SqlFunction",
     "SqlSchema",
-    "SqlView",
     "SqlStatistics",
-    "SqlFunction",
+    "SqlTable",
+    "SqlType",
+    "SqlView",
 ]
 
 

diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -393,8 +393,6 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder:
 class RuntimeConfig(RuntimeEnvBuilder):
     """See `RuntimeEnvBuilder`."""
 
-    pass
-
 
 class SQLOptions:
     """Options to be used when performing SQL queries."""
@@ -498,7 +496,7 @@ def __init__(
 
         self.ctx = SessionContextInternal(config, runtime)
 
-    def enable_url_table(self) -> "SessionContext":
+    def enable_url_table(self) -> SessionContext:
         """Control if local files can be queried as tables.
 
         Returns: