
Commit 694851d

More ruff updates
1 parent 38e3630 commit 694851d

File tree

4 files changed: +57 −48 lines


pyproject.toml

Lines changed: 15 additions & 2 deletions
@@ -67,7 +67,7 @@ features = ["substrait"]
 # Enable docstring linting using the google style guide
 [tool.ruff.lint]
 select = ["ALL" ]
-ignore = ["COM812", "ISC001"] # Recommended to ignore these rules when using with ruff-format
+ignore = ["COM812", "ISC001", "TD002"] # Recommended to ignore these rules when using with ruff-format

 [tool.ruff.lint.pydocstyle]
 convention = "google"
@@ -77,7 +77,20 @@ max-doc-length = 88

 # Disable docstring checking for these directories
 [tool.ruff.lint.per-file-ignores]
-"python/tests/*" = ["ANN", "ARG", "D", "S101", "SLF", "PD", "PLR2004", "PT011", "RUF015", "S608", "PLR0913"]
+"python/tests/*" = [
+    "ANN",
+    "ARG",
+    "BLE001",
+    "D",
+    "S101",
+    "SLF",
+    "PD",
+    "PLR2004",
+    "PT011",
+    "RUF015",
+    "S608",
+    "PLR0913"
+]
 "examples/*" = ["D", "W505"]
 "dev/*" = ["D"]
 "benchmarks/*" = ["D", "F"]

python/tests/test_context.py

Lines changed: 26 additions & 25 deletions
@@ -16,7 +16,6 @@
 # under the License.
 import datetime as dt
 import gzip
-import os
 import pathlib

 import pyarrow as pa
@@ -45,7 +44,7 @@ def test_create_context_runtime_config_only():
     SessionContext(runtime=RuntimeEnvBuilder())


-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_runtime_configs(tmp_path, path_to_str):
     path1 = tmp_path / "dir1"
     path2 = tmp_path / "dir2"
@@ -62,7 +61,7 @@ def test_runtime_configs(tmp_path, path_to_str):
     assert db is not None


-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_temporary_files(tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path

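The `(True, False)` → `[True, False]` edits above match ruff's PT007 (pytest-parametrize-values-wrong-type), whose default style expects parametrize values as a list. A self-contained sketch of the preferred form:

import pytest

@pytest.mark.parametrize("path_to_str", [True, False])  # PT007: values as a list, not a tuple
def test_accepts_both(path_to_str):
    assert isinstance(path_to_str, bool)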
@@ -79,14 +78,14 @@ def test_create_context_with_all_valid_args():
     runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
     config = (
         SessionConfig()
-        .with_create_default_catalog_and_schema(True)
+        .with_create_default_catalog_and_schema(enabled=True)
         .with_default_catalog_and_schema("foo", "bar")
         .with_target_partitions(1)
-        .with_information_schema(True)
-        .with_repartition_joins(False)
-        .with_repartition_aggregations(False)
-        .with_repartition_windows(False)
-        .with_parquet_pruning(False)
+        .with_information_schema(enabled=True)
+        .with_repartition_joins(enabled=False)
+        .with_repartition_aggregations(enabled=False)
+        .with_repartition_windows(enabled=False)
+        .with_parquet_pruning(enabled=False)
     )

     ctx = SessionContext(config, runtime)
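Naming the booleans (`enabled=True`, `enabled=False`) lines up with ruff's flake8-boolean-trap rule FBT003, which flags boolean values passed positionally. A sketch of the keyword-only style the rule nudges toward; the class and attribute here are illustrative stand-ins, not the real datafusion binding:

class SessionConfigSketch:
    def with_information_schema(self, *, enabled: bool = True) -> "SessionConfigSketch":
        # Keyword-only `enabled` forces callers to name the flag.
        self.enable_information_schema = enabled
        return self

# SessionConfigSketch().with_information_schema(True)          -> TypeError, and what FBT003 guards against
# SessionConfigSketch().with_information_schema(enabled=True)  -> self-documenting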
@@ -167,7 +166,7 @@ def test_from_arrow_table(ctx):

 def record_batch_generator(num_batches: int):
     schema = pa.schema([("a", pa.int64()), ("b", pa.int64())])
-    for i in range(num_batches):
+    for _i in range(num_batches):
         yield pa.RecordBatch.from_arrays(
             [pa.array([1, 2, 3]), pa.array([4, 5, 6])], schema=schema
         )
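Renaming `i` to `_i` satisfies ruff's B007 (unused loop control variable): the underscore prefix marks the counter as intentionally unused while the loop still yields `num_batches` identical batches; `for _ in range(num_batches):` would be an equally valid spelling.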
@@ -492,10 +491,10 @@ def test_table_not_found(ctx):


 def test_read_json(ctx):
-    path = os.path.dirname(os.path.abspath(__file__))
+    path = pathlib.Path(__file__).parent.resolve()

     # Default
-    test_data_path = os.path.join(path, "data_test_context", "data.json")
+    test_data_path = path / "data_test_context" / "data.json"
     df = ctx.read_json(test_data_path)
     result = df.collect()

@@ -515,7 +514,7 @@ def test_read_json(ctx):
     assert result[0].schema == schema

     # File extension
-    test_data_path = os.path.join(path, "data_test_context", "data.json")
+    test_data_path = path / "data_test_context" / "data.json"
     df = ctx.read_json(test_data_path, file_extension=".json")
     result = df.collect()

@@ -524,15 +523,16 @@ def test_read_json(ctx):


 def test_read_json_compressed(ctx, tmp_path):
-    path = os.path.dirname(os.path.abspath(__file__))
-    test_data_path = os.path.join(path, "data_test_context", "data.json")
+    path = pathlib.Path(__file__).parent.resolve()
+    test_data_path = path / "data_test_context" / "data.json"

     # File compression type
     gzip_path = tmp_path / "data.json.gz"

-    with open(test_data_path, "rb") as csv_file:
-        with gzip.open(gzip_path, "wb") as gzipped_file:
-            gzipped_file.writelines(csv_file)
+    with pathlib.Path.open(test_data_path, "rb") as csv_file, gzip.open(
+        gzip_path, "wb"
+    ) as gzipped_file:
+        gzipped_file.writelines(csv_file)

     df = ctx.read_json(gzip_path, file_extension=".gz", file_compression_type="gz")
     result = df.collect()
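Two rule families meet in these hunks: the flake8-use-pathlib (PTH) checks replace `os.path` calls with `pathlib.Path` operations, and SIM117 (flake8-simplify) folds the nested `with` blocks into a single statement. A standalone sketch of both patterns, with illustrative paths:

import gzip
import pathlib
import tempfile

src = pathlib.Path(__file__).parent / "data.json"       # PTH: `/` operator instead of os.path.join
dst = pathlib.Path(tempfile.mkdtemp()) / "data.json.gz"

# SIM117: one `with` statement manages both context managers
with src.open("rb") as plain, gzip.open(dst, "wb") as compressed:
    compressed.writelines(plain)

`src.open(...)` is the bound-method spelling of the unbound `pathlib.Path.open(test_data_path, ...)` call the diff uses; both resolve to the same method.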
@@ -563,14 +563,15 @@ def test_read_csv_list(ctx):


 def test_read_csv_compressed(ctx, tmp_path):
-    test_data_path = "testing/data/csv/aggregate_test_100.csv"
+    test_data_path = pathlib.Path("testing/data/csv/aggregate_test_100.csv")

     # File compression type
     gzip_path = tmp_path / "aggregate_test_100.csv.gz"

-    with open(test_data_path, "rb") as csv_file:
-        with gzip.open(gzip_path, "wb") as gzipped_file:
-            gzipped_file.writelines(csv_file)
+    with pathlib.Path.open(test_data_path, "rb") as csv_file, gzip.open(
+        gzip_path, "wb"
+    ) as gzipped_file:
+        gzipped_file.writelines(csv_file)

     csv_df = ctx.read_csv(gzip_path, file_extension=".gz", file_compression_type="gz")
     csv_df.select(column("c1")).show()
@@ -603,7 +604,7 @@ def test_create_sql_options():
 def test_sql_with_options_no_ddl(ctx):
     sql = "CREATE TABLE IF NOT EXISTS valuetable AS VALUES(1,'HELLO'),(12,'DATAFUSION')"
     ctx.sql(sql)
-    options = SQLOptions().with_allow_ddl(False)
+    options = SQLOptions().with_allow_ddl(allow=False)
     with pytest.raises(Exception, match="DDL"):
         ctx.sql_with_options(sql, options=options)

@@ -618,14 +619,14 @@ def test_sql_with_options_no_dml(ctx):
     ctx.register_dataset(table_name, dataset)
     sql = f'INSERT INTO "{table_name}" VALUES (1, 2), (2, 3);'
     ctx.sql(sql)
-    options = SQLOptions().with_allow_dml(False)
+    options = SQLOptions().with_allow_dml(allow=False)
     with pytest.raises(Exception, match="DML"):
         ctx.sql_with_options(sql, options=options)


 def test_sql_with_options_no_statements(ctx):
     sql = "SET time zone = 1;"
     ctx.sql(sql)
-    options = SQLOptions().with_allow_statements(False)
+    options = SQLOptions().with_allow_statements(allow=False)
     with pytest.raises(Exception, match="SetVariable"):
         ctx.sql_with_options(sql, options=options)

python/tests/test_dataframe.py

Lines changed: 12 additions & 14 deletions
@@ -339,7 +339,7 @@ def test_join():

     # Verify we don't make a breaking change to pre-43.0.0
     # where users would pass join_keys as a positional argument
-    df2 = df.join(df1, (["a"], ["a"]), how="inner") # type: ignore
+    df2 = df.join(df1, (["a"], ["a"]), how="inner")
     df2.show()
     df2 = df2.sort(column("l.a"))
     table = pa.Table.from_batches(df2.collect())
@@ -375,17 +375,17 @@ def test_join_invalid_params():
     with pytest.raises(
         ValueError, match=r"`left_on` or `right_on` should not provided with `on`"
     ):
-        df2 = df.join(df1, on="a", how="inner", right_on="test") # type: ignore
+        df2 = df.join(df1, on="a", how="inner", right_on="test")

     with pytest.raises(
         ValueError, match=r"`left_on` and `right_on` should both be provided."
     ):
-        df2 = df.join(df1, left_on="a", how="inner") # type: ignore
+        df2 = df.join(df1, left_on="a", how="inner")

     with pytest.raises(
         ValueError, match=r"either `on` or `left_on` and `right_on` should be provided."
     ):
-        df2 = df.join(df1, how="inner") # type: ignore
+        df2 = df.join(df1, how="inner")


 def test_join_on():
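Note these hunks only remove blanket `# type: ignore` comments. When a suppression genuinely is needed, ruff's PGH003 (blanket-type-ignore) wants the specific checker code spelled out, e.g. `# type: ignore[call-overload]` (the code shown is illustrative). Here the comments were presumably no longer suppressing anything, so dropping them is the cleaner fix.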
@@ -567,7 +567,7 @@ def test_distinct():
     ]


-@pytest.mark.parametrize("name,expr,result", data_test_window_functions)
+@pytest.mark.parametrize(("name", "expr", "result"), data_test_window_functions)
 def test_window_functions(partitioned_df, name, expr, result):
     df = partitioned_df.select(
         column("a"), column("b"), column("c"), f.alias(expr, name)
@@ -885,7 +885,7 @@ def test_union_distinct(ctx):
     )
     df_c = ctx.create_dataframe([[batch]]).sort(column("a"))

-    df_a_u_b = df_a.union(df_b, True).sort(column("a"))
+    df_a_u_b = df_a.union(df_b, distinct=True).sort(column("a"))

     assert df_c.collect() == df_a_u_b.collect()
     assert df_c.collect() == df_a_u_b.collect()
@@ -954,8 +954,6 @@ def test_to_arrow_table(df):

 def test_execute_stream(df):
     stream = df.execute_stream()
-    for s in stream:
-        print(type(s))
     assert all(batch is not None for batch in stream)
     assert not list(stream)  # after one iteration the generator must be exhausted

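Deleting the `print` loop does double duty: ruff's T201 bans stray `print` calls, and, assuming `execute_stream()` returns a one-shot iterator (as the trailing comment implies), the loop was exhausting the stream before `assert all(...)` ran, leaving that assertion to pass vacuously over an empty iterator. With the loop gone, the first assertion actually inspects every batch and the second verifies exhaustion.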
@@ -1033,7 +1031,7 @@ def test_describe(df):
     }


-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_write_csv(ctx, df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path

@@ -1046,7 +1044,7 @@ def test_write_csv(ctx, df, tmp_path, path_to_str):
     assert result == expected


-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_write_json(ctx, df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path

@@ -1059,7 +1057,7 @@ def test_write_json(ctx, df, tmp_path, path_to_str):
     assert result == expected


-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_write_parquet(df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path

@@ -1071,7 +1069,7 @@ def test_write_parquet(df, tmp_path, path_to_str):


 @pytest.mark.parametrize(
-    "compression, compression_level",
+    ("compression", "compression_level"),
     [("gzip", 6), ("brotli", 7), ("zstd", 15)],
 )
 def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
@@ -1082,7 +1080,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
     )

     # test that the actual compression scheme is the one written
-    for root, dirs, files in os.walk(path):
+    for _root, _dirs, files in os.walk(path):
         for file in files:
             if file.endswith(".parquet"):
                 metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict()
@@ -1097,7 +1095,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level):


 @pytest.mark.parametrize(
-    "compression, compression_level",
+    ("compression", "compression_level"),
     [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)],
 )
 def test_write_compressed_parquet_wrong_compression_level(

python/tests/test_expr.py

Lines changed: 4 additions & 7 deletions
@@ -85,18 +85,14 @@ def test_limit(test_ctx):

     plan = plan.to_variant()
     assert isinstance(plan, Limit)
-    # TODO: Upstream now has expressions for skip and fetch
-    # REF: https://github.com/apache/datafusion/pull/12836
-    # assert plan.skip() == 0
+    assert "Skip: None" in str(plan)

     df = test_ctx.sql("select c1 from test LIMIT 10 OFFSET 5")
     plan = df.logical_plan()

     plan = plan.to_variant()
     assert isinstance(plan, Limit)
-    # TODO: Upstream now has expressions for skip and fetch
-    # REF: https://github.com/apache/datafusion/pull/12836
-    # assert plan.skip() == 5
+    assert "Skip: Some(Literal(Int64(5)))" in str(plan)


 def test_aggregate_query(test_ctx):
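The commented-out `plan.skip()` assertions (the kind of dead code ruff's ERA001 reports) give way to live assertions against the plan's string rendering, so the test exercises real behavior again instead of carrying a TODO.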
@@ -165,6 +161,7 @@ def traverse_logical_plan(plan):
                 res = traverse_logical_plan(input_plan)
                 if res is not None:
                     return res
+        return None

     ctx = SessionContext()
     data = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
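The added `return None` matches ruff's RET503 (implicit-return): a function that returns a value on some paths must end the fall-through path with an explicit `return`. A minimal standalone example:

def first_even(values: list[int]) -> int | None:
    for value in values:
        if value % 2 == 0:
            return value  # explicit value on the found path
    return None  # RET503: make the not-found path explicit too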
@@ -176,7 +173,7 @@ def traverse_logical_plan(plan):
     assert variant.expr().to_variant().qualified_name() == "table1.name"
     assert (
         str(variant.list())
-        == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]'
+        == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' # noqa: E501
     )
     assert not variant.negated()
