Commit 59fffe3

[infra] replace pycln with ruff (apache#1485)
* pre-commit autoupdate
* run ruff linter and formatter
* remove pycln
* ignore some rules
* make lint
* poetry add ruff --dev
* remove ruff from dev dep
* git checkout apache/main poetry.lock
* add back --exit-non-zero-on-fix
1 parent acd6f5a commit 59fffe3

27 files changed: +1535 -1324 lines
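Most of the Python churn below is mechanical fallout from the first hunk: dropping `--preview` from the `ruff-format` hook disables ruff's preview "hug brackets" style, so a call whose only argument is a list, set, or dict literal is expanded onto its own indentation level instead of sharing lines with the call parentheses. A minimal sketch of the two shapes, with hypothetical `process`/`items` names (kept expanded for illustration; snippets this short would really be collapsed onto one line):

items = ["alpha", "beta"]

def process(names: list[str]) -> None:
    print(names)

# Hugged (preview) style, as `ruff format --preview` emitted it: the
# list's brackets share lines with the call's parentheses.
process([
    name.upper()
    for name in items
])

# Stable style, as plain `ruff format` emits it: the call and the
# list literal are expanded separately, adding one indent level.
process(
    [
        name.upper()
        for name in items
    ]
)

Every pure-formatting hunk below (output.py, pyarrow.py, manifest.py, schema.py, table/__init__.py) is this same expansion.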

.pre-commit-config.yaml

Lines changed: 4 additions & 11 deletions
@@ -28,26 +28,19 @@ repos:
       - id: check-yaml
       - id: check-ast
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version (Used for linting)
-    rev: v0.7.4
+    rev: v0.8.6
     hooks:
       - id: ruff
-        args: [ --fix, --exit-non-zero-on-fix, --preview ]
+        args: [ --fix, --exit-non-zero-on-fix ]
       - id: ruff-format
-        args: [ --preview ]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.8.0
+    rev: v1.14.1
     hooks:
       - id: mypy
         args:
           [--install-types, --non-interactive, --config=pyproject.toml]
-  - repo: https://github.com/hadialqattan/pycln
-    rev: v2.4.0
-    hooks:
-      - id: pycln
-        args: [--config=pyproject.toml]
   - repo: https://github.com/igorshubovych/markdownlint-cli
-    rev: v0.42.0
+    rev: v0.43.0
     hooks:
       - id: markdownlint
         args: ["--fix"]

pyiceberg/cli/output.py

Lines changed: 7 additions & 5 deletions
@@ -242,8 +242,10 @@ def version(self, version: str) -> None:
         self._out({"version": version})
 
     def describe_refs(self, refs: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None:
-        self._out([
-            {"name": name, "type": type, detail_key: detail_val}
-            for name, type, detail in refs
-            for detail_key, detail_val in detail.items()
-        ])
+        self._out(
+            [
+                {"name": name, "type": type, detail_key: detail_val}
+                for name, type, detail in refs
+                for detail_key, detail_val in detail.items()
+            ]
+        )

pyiceberg/expressions/visitors.py

Lines changed: 8 additions & 8 deletions
@@ -1228,7 +1228,7 @@ def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
                 # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
                 return ROWS_MIGHT_MATCH
 
-            if lower_bound >= literal.value:
+            if lower_bound >= literal.value:  # type: ignore[operator]
                 return ROWS_CANNOT_MATCH
 
         return ROWS_MIGHT_MATCH
@@ -1249,7 +1249,7 @@ def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
                 # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
                 return ROWS_MIGHT_MATCH
 
-            if lower_bound > literal.value:
+            if lower_bound > literal.value:  # type: ignore[operator]
                 return ROWS_CANNOT_MATCH
 
         return ROWS_MIGHT_MATCH
@@ -1266,7 +1266,7 @@ def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
 
         if upper_bound_bytes := self.upper_bounds.get(field_id):
             upper_bound = from_bytes(field.field_type, upper_bound_bytes)
-            if upper_bound <= literal.value:
+            if upper_bound <= literal.value:  # type: ignore[operator]
                 if self._is_nan(upper_bound):
                     # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
                     return ROWS_MIGHT_MATCH
@@ -1287,7 +1287,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
 
         if upper_bound_bytes := self.upper_bounds.get(field_id):
             upper_bound = from_bytes(field.field_type, upper_bound_bytes)
-            if upper_bound < literal.value:
+            if upper_bound < literal.value:  # type: ignore[operator]
                 if self._is_nan(upper_bound):
                     # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
                     return ROWS_MIGHT_MATCH
@@ -1312,7 +1312,7 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
                 # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
                 return ROWS_MIGHT_MATCH
 
-            if lower_bound > literal.value:
+            if lower_bound > literal.value:  # type: ignore[operator]
                 return ROWS_CANNOT_MATCH
 
         if upper_bound_bytes := self.upper_bounds.get(field_id):
@@ -1321,7 +1321,7 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
                 # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
                 return ROWS_MIGHT_MATCH
 
-            if upper_bound < literal.value:
+            if upper_bound < literal.value:  # type: ignore[operator]
                 return ROWS_CANNOT_MATCH
 
         return ROWS_MIGHT_MATCH
@@ -1349,7 +1349,7 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool:
                 # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
                 return ROWS_MIGHT_MATCH
 
-            literals = {lit for lit in literals if lower_bound <= lit}
+            literals = {lit for lit in literals if lower_bound <= lit}  # type: ignore[operator]
             if len(literals) == 0:
                 return ROWS_CANNOT_MATCH
 
@@ -1359,7 +1359,7 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool:
             if self._is_nan(upper_bound):
                 return ROWS_MIGHT_MATCH
 
-            literals = {lit for lit in literals if upper_bound >= lit}
+            literals = {lit for lit in literals if upper_bound >= lit}  # type: ignore[operator]
             if len(literals) == 0:
                 return ROWS_CANNOT_MATCH
 
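The eight one-line edits above change no behavior: they suppress mypy's [operator] error. The bounds decoded by `from_bytes` carry a static type broad enough that mypy cannot prove `<`, `<=`, `>`, `>=` are defined against the literal's value, even though the runtime values always line up. A self-contained sketch of the same complaint, with simplified stand-in types rather than the real pyiceberg signatures:

from typing import Union

# Stand-in for a decoded column statistic: the static type is a broad
# union, even though at runtime it always matches the literal's type.
Bound = Union[int, float, str]

def decode_bound(raw: bytes) -> Bound:
    return int.from_bytes(raw, "big")

lower_bound = decode_bound(b"\x05")

# Without the suppression, mypy reports e.g.:
#   Unsupported operand types for >= ("str" and "int")  [operator]
# The ignore acknowledges a runtime invariant mypy cannot see.
rows_cannot_match = lower_bound >= 3  # type: ignore[operator]

print(rows_cannot_match)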
pyiceberg/io/pyarrow.py

Lines changed: 25 additions & 19 deletions
@@ -2449,27 +2449,31 @@ def _dataframe_to_data_files(
         yield from write_file(
             io=io,
             table_metadata=table_metadata,
-            tasks=iter([
-                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema)
-                for batches in bin_pack_arrow_table(df, target_file_size)
-            ]),
+            tasks=iter(
+                [
+                    WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema)
+                    for batches in bin_pack_arrow_table(df, target_file_size)
+                ]
+            ),
         )
     else:
         partitions = _determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df)
         yield from write_file(
             io=io,
             table_metadata=table_metadata,
-            tasks=iter([
-                WriteTask(
-                    write_uuid=write_uuid,
-                    task_id=next(counter),
-                    record_batches=batches,
-                    partition_key=partition.partition_key,
-                    schema=task_schema,
-                )
-                for partition in partitions
-                for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
-            ]),
+            tasks=iter(
+                [
+                    WriteTask(
+                        write_uuid=write_uuid,
+                        task_id=next(counter),
+                        record_batches=batches,
+                        partition_key=partition.partition_key,
+                        schema=task_schema,
+                    )
+                    for partition in partitions
+                    for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
+                ]
+            ),
         )
 
 
@@ -2534,10 +2538,12 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.Table)
     partition_columns: List[Tuple[PartitionField, NestedField]] = [
         (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields
     ]
-    partition_values_table = pa.table({
-        str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name])
-        for partition, field in partition_columns
-    })
+    partition_values_table = pa.table(
+        {
+            str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name])
+            for partition, field in partition_columns
+        }
+    )
 
     # Sort by partitions
     sort_indices = pa.compute.sort_indices(

pyiceberg/manifest.py

Lines changed: 31 additions & 25 deletions
@@ -292,28 +292,32 @@ def __repr__(self) -> str:
 
 
 def data_file_with_partition(partition_type: StructType, format_version: TableVersion) -> StructType:
-    data_file_partition_type = StructType(*[
-        NestedField(
-            field_id=field.field_id,
-            name=field.name,
-            field_type=field.field_type,
-            required=field.required,
-        )
-        for field in partition_type.fields
-    ])
+    data_file_partition_type = StructType(
+        *[
+            NestedField(
+                field_id=field.field_id,
+                name=field.name,
+                field_type=field.field_type,
+                required=field.required,
+            )
+            for field in partition_type.fields
+        ]
+    )
 
-    return StructType(*[
-        NestedField(
-            field_id=102,
-            name="partition",
-            field_type=data_file_partition_type,
-            required=True,
-            doc="Partition data tuple, schema based on the partition spec",
-        )
-        if field.field_id == 102
-        else field
-        for field in DATA_FILE_TYPE[format_version].fields
-    ])
+    return StructType(
+        *[
+            NestedField(
+                field_id=102,
+                name="partition",
+                field_type=data_file_partition_type,
+                required=True,
+                doc="Partition data tuple, schema based on the partition spec",
+            )
+            if field.field_id == 102
+            else field
+            for field in DATA_FILE_TYPE[format_version].fields
+        ]
+    )
 
 
 class DataFile(Record):
@@ -398,10 +402,12 @@ def __eq__(self, other: Any) -> bool:
 
 
 def manifest_entry_schema_with_data_file(format_version: TableVersion, data_file: StructType) -> Schema:
-    return Schema(*[
-        NestedField(2, "data_file", data_file, required=True) if field.field_id == 2 else field
-        for field in MANIFEST_ENTRY_SCHEMAS[format_version].fields
-    ])
+    return Schema(
+        *[
+            NestedField(2, "data_file", data_file, required=True) if field.field_id == 2 else field
+            for field in MANIFEST_ENTRY_SCHEMAS[format_version].fields
+        ]
+    )
 
 
 class ManifestEntry(Record):

pyiceberg/schema.py

Lines changed: 8 additions & 6 deletions
@@ -1707,12 +1707,14 @@ def list(self, list_type: ListType, element_result: Callable[[], bool]) -> bool:
         return self._is_field_compatible(list_type.element_field) and element_result()
 
     def map(self, map_type: MapType, key_result: Callable[[], bool], value_result: Callable[[], bool]) -> bool:
-        return all([
-            self._is_field_compatible(map_type.key_field),
-            self._is_field_compatible(map_type.value_field),
-            key_result(),
-            value_result(),
-        ])
+        return all(
+            [
+                self._is_field_compatible(map_type.key_field),
+                self._is_field_compatible(map_type.value_field),
+                key_result(),
+                value_result(),
+            ]
+        )
 
     def primitive(self, primitive: PrimitiveType) -> bool:
         return True

pyiceberg/table/__init__.py

Lines changed: 17 additions & 15 deletions
@@ -629,18 +629,20 @@ def delete(
                 if len(filtered_df) == 0:
                     replaced_files.append((original_file.file, []))
                 elif len(df) != len(filtered_df):
-                    replaced_files.append((
-                        original_file.file,
-                        list(
-                            _dataframe_to_data_files(
-                                io=self._table.io,
-                                df=filtered_df,
-                                table_metadata=self.table_metadata,
-                                write_uuid=commit_uuid,
-                                counter=counter,
-                            )
-                        ),
-                    ))
+                    replaced_files.append(
+                        (
+                            original_file.file,
+                            list(
+                                _dataframe_to_data_files(
+                                    io=self._table.io,
+                                    df=filtered_df,
+                                    table_metadata=self.table_metadata,
+                                    write_uuid=commit_uuid,
+                                    counter=counter,
+                                )
+                            ),
+                        )
+                    )
 
             if len(replaced_files) > 0:
                 with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite() as overwrite_snapshot:
@@ -680,9 +682,9 @@ def add_files(
             raise ValueError(f"Cannot add files that are already referenced by table, files: {', '.join(referenced_files)}")
 
         if self.table_metadata.name_mapping() is None:
-            self.set_properties(**{
-                TableProperties.DEFAULT_NAME_MAPPING: self.table_metadata.schema().name_mapping.model_dump_json()
-            })
+            self.set_properties(
+                **{TableProperties.DEFAULT_NAME_MAPPING: self.table_metadata.schema().name_mapping.model_dump_json()}
+            )
         with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot:
             data_files = _parquet_files_to_data_files(
                 table_metadata=self.table_metadata, file_paths=file_paths, io=self._table.io