
Commit 8468086

Fokko authored and sungwy committed
Remove numpy dependency (apache#1270)
* Remove numpy as a hard dependency

  With Arrow 18.0.0, numpy is not a dependency anymore: apache/arrow#44148. I think it would be good to also remove it from PyIceberg.

* Add link to issue
1 parent 3a68652 commit 8468086

File tree

1 file changed (+11 −2 lines)


pyiceberg/io/pyarrow.py

Lines changed: 11 additions & 2 deletions
@@ -57,7 +57,6 @@
 )
 from urllib.parse import urlparse
 
-import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.dataset as ds
@@ -812,7 +811,17 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
         all_chunks = positional_deletes[0]
     else:
         all_chunks = pa.chunked_array(itertools.chain(*[arr.chunks for arr in positional_deletes]))
-    return np.subtract(np.setdiff1d(np.arange(start_index, end_index), all_chunks, assume_unique=False), start_index)
+
+    # Create the full range array with pyarrow
+    full_range = pa.array(range(start_index, end_index))
+    # When available, replace with Arrow generator to improve performance
+    # See https://github.com/apache/iceberg-python/issues/1271 for details
+
+    # Filter out values in all_chunks from full_range
+    result = pc.filter(full_range, pc.invert(pc.is_in(full_range, value_set=all_chunks)))
+
+    # Subtract the start_index from each element in the result
+    return pc.subtract(result, pa.scalar(start_index))
 
 
 def pyarrow_to_schema(
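
For context, the replacement logic can be exercised on its own. The sketch below is not part of the commit; the helper name and the sample inputs are illustrative. It reproduces the new pyarrow-only computation: build the full range of row positions for a slice, drop the positions listed in the positional-delete vectors, and re-base the survivors against start_index.

# Sketch only: function name and sample data are illustrative, not from the commit
import itertools

import pyarrow as pa
import pyarrow.compute as pc


def surviving_positions(positional_deletes, start_index, end_index):
    # Merge all positional-delete vectors into a single ChunkedArray
    if len(positional_deletes) == 1:
        all_chunks = positional_deletes[0]
    else:
        all_chunks = pa.chunked_array(itertools.chain(*[arr.chunks for arr in positional_deletes]))

    # Full range of row positions covered by this slice
    full_range = pa.array(range(start_index, end_index))

    # Keep only positions that are not in the delete set
    result = pc.filter(full_range, pc.invert(pc.is_in(full_range, value_set=all_chunks)))

    # Re-base the surviving positions so they are relative to start_index
    return pc.subtract(result, pa.scalar(start_index))


# Rows 12 and 14 are deleted within the slice [10, 16)
deletes = [pa.chunked_array([pa.array([12, 14])])]
print(surviving_positions(deletes, 10, 16).to_pylist())  # [0, 1, 3, 5]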
