Skip to content

Commit 86a9c1f

Browse files
committed
Merge branch-25.06 into branch-25.08
2 parents 5c11916 + f43bad0 commit 86a9c1f

24 files changed

+233
-123
lines changed

conda/environments/all_cuda-118_arch-aarch64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ dependencies:
6666
- pandas
6767
- pandas>=2.0,<2.2.4dev0
6868
- pandoc
69-
- polars>=1.24,<1.28
69+
- polars>=1.25,<1.29
7070
- pre-commit
7171
- ptxcompiler
7272
- pyarrow>=14.0.0,<20.0.0a0

conda/environments/all_cuda-118_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ dependencies:
6868
- pandas
6969
- pandas>=2.0,<2.2.4dev0
7070
- pandoc
71-
- polars>=1.24,<1.28
71+
- polars>=1.25,<1.29
7272
- pre-commit
7373
- ptxcompiler
7474
- pyarrow>=14.0.0,<20.0.0a0

conda/environments/all_cuda-128_arch-aarch64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ dependencies:
6565
- pandas
6666
- pandas>=2.0,<2.2.4dev0
6767
- pandoc
68-
- polars>=1.24,<1.28
68+
- polars>=1.25,<1.29
6969
- pre-commit
7070
- pyarrow>=14.0.0,<20.0.0a0
7171
- pydata-sphinx-theme>=0.15.4

conda/environments/all_cuda-128_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ dependencies:
6666
- pandas
6767
- pandas>=2.0,<2.2.4dev0
6868
- pandoc
69-
- polars>=1.24,<1.28
69+
- polars>=1.25,<1.29
7070
- pre-commit
7171
- pyarrow>=14.0.0,<20.0.0a0
7272
- pydata-sphinx-theme>=0.15.4

dependencies.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,7 @@ dependencies:
831831
common:
832832
- output_types: [conda, requirements, pyproject]
833833
packages:
834-
- polars>=1.24,<1.28
834+
- polars>=1.25,<1.29
835835
run_cudf_polars_experimental:
836836
common:
837837
- output_types: [conda, requirements, pyproject]

python/cudf_polars/cudf_polars/callback.py

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
from cudf_polars.dsl.translate import Translator
2525
from cudf_polars.utils.timer import Timer
26-
from cudf_polars.utils.versions import POLARS_VERSION_LT_125
2726

2827
if TYPE_CHECKING:
2928
from collections.abc import Generator
@@ -308,28 +307,12 @@ def execute_with_cudf(
308307
if translator.config_options.raise_on_fail:
309308
raise exception
310309
else:
311-
if POLARS_VERSION_LT_125: # pragma: no cover
312-
nt.set_udf(
313-
partial(
314-
_callback,
315-
ir,
316-
should_time=False,
317-
memory_resource=memory_resource,
318-
config_options=translator.config_options,
319-
timer=None,
320-
)
310+
nt.set_udf(
311+
partial(
312+
_callback,
313+
ir,
314+
memory_resource=memory_resource,
315+
config_options=translator.config_options,
316+
timer=timer,
321317
)
322-
else:
323-
nt.set_udf(
324-
partial(
325-
_callback,
326-
ir,
327-
memory_resource=memory_resource,
328-
config_options=translator.config_options,
329-
timer=timer,
330-
)
331-
)
332-
333-
334-
if POLARS_VERSION_LT_125: # pragma: no cover
335-
execute_with_cudf = partial(execute_with_cudf, duration_since_start=None)
318+
)

python/cudf_polars/cudf_polars/containers/dataframe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
if TYPE_CHECKING:
1919
from collections.abc import Iterable, Mapping, Sequence, Set
2020

21-
from typing_extensions import Self
21+
from typing_extensions import Any, Self
2222

2323
from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
2424

@@ -270,7 +270,7 @@ def discard_columns(self, names: Set[str]) -> Self:
270270
"""Drop columns by name."""
271271
return type(self)(column for column in self.columns if column.name not in names)
272272

273-
def select(self, names: Sequence[str]) -> Self:
273+
def select(self, names: Sequence[str] | Mapping[str, Any]) -> Self:
274274
"""Select columns by name returning DataFrame."""
275275
try:
276276
return type(self)(self.column_map[name] for name in names)

python/cudf_polars/cudf_polars/dsl/expressions/boolean.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
ExecutionContext,
2020
Expr,
2121
)
22+
from cudf_polars.utils.versions import POLARS_VERSION_LT_128
2223

2324
if TYPE_CHECKING:
2425
from typing_extensions import Self
@@ -87,9 +88,11 @@ def __init__(
8788
BooleanFunction.Name.IsLastDistinct,
8889
BooleanFunction.Name.IsUnique,
8990
)
90-
if self.name is BooleanFunction.Name.IsIn and not all(
91-
c.dtype == self.children[0].dtype for c in self.children
92-
):
91+
if (
92+
POLARS_VERSION_LT_128
93+
and self.name is BooleanFunction.Name.IsIn
94+
and not all(c.dtype == self.children[0].dtype for c in self.children)
95+
): # pragma: no cover
9396
# TODO: If polars IR doesn't put the casts in, we need to
9497
# mimic the supertype promotion rules.
9598
raise NotImplementedError("IsIn doesn't support supertype casting")

python/cudf_polars/cudf_polars/dsl/expressions/unary.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
1414
from cudf_polars.dsl.expressions.literal import Literal
1515
from cudf_polars.utils import dtypes
16+
from cudf_polars.utils.versions import POLARS_VERSION_LT_128
1617

1718
if TYPE_CHECKING:
1819
from cudf_polars.containers import DataFrame
@@ -233,6 +234,14 @@ def do_evaluate(
233234
else:
234235
evaluated = self.children[1].evaluate(df, context=context)
235236
arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj
237+
if (
238+
not POLARS_VERSION_LT_128
239+
and isinstance(arg, plc.Scalar)
240+
and dtypes.can_cast(column.obj.type(), arg.type())
241+
): # pragma: no cover
242+
arg = plc.unary.cast(
243+
plc.Column.from_scalar(arg, 1), column.obj.type()
244+
).to_scalar()
236245
return Column(plc.replace.replace_nulls(column.obj, arg))
237246
elif self.name in self._OP_MAPPING:
238247
column = self.children[0].evaluate(df, context=context)

python/cudf_polars/cudf_polars/dsl/ir.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from cudf_polars.dsl.nodebase import Node
3434
from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter
3535
from cudf_polars.utils import dtypes
36+
from cudf_polars.utils.versions import POLARS_VERSION_LT_128
3637

3738
if TYPE_CHECKING:
3839
from collections.abc import Callable, Hashable, Iterable, Sequence
@@ -373,7 +374,9 @@ def __init__(
373374
# TODO: polars has this implemented for parquet,
374375
# maybe we can do this too?
375376
raise NotImplementedError("slice pushdown for negative slices")
376-
if self.typ in {"csv"} and self.skip_rows != 0: # pragma: no cover
377+
if (
378+
POLARS_VERSION_LT_128 and self.typ in {"csv"} and self.skip_rows != 0
379+
): # pragma: no cover
377380
# This comes from slice pushdown, but that
378381
# optimization doesn't happen right now
379382
raise NotImplementedError("skipping rows in CSV reader")
@@ -383,7 +386,7 @@ def __init__(
383386
raise NotImplementedError(
384387
"Read from cloud storage"
385388
) # pragma: no cover; no test yet
386-
if any(p.startswith("https://") for p in self.paths):
389+
if any(str(p).startswith("https:/") for p in self.paths):
387390
raise NotImplementedError("Read from https")
388391
if self.typ == "csv":
389392
if self.reader_options["skip_rows_after_header"] != 0:
@@ -459,7 +462,8 @@ def add_file_paths(
459462
Each path is repeated according to the number of rows read from it.
460463
"""
461464
(filepaths,) = plc.filling.repeat(
462-
plc.Table([plc.interop.from_arrow(pa.array(paths))]),
465+
# TODO: Remove call from_arrow when we support python list to Column
466+
plc.Table([plc.interop.from_arrow(pa.array(map(str, paths)))]),
463467
plc.interop.from_arrow(pa.array(rows_per_path, type=pa.int32())),
464468
).columns()
465469
return df.with_columns([Column(filepaths, name=name)])
@@ -481,6 +485,17 @@ def do_evaluate(
481485
) -> DataFrame:
482486
"""Evaluate and return a dataframe."""
483487
if typ == "csv":
488+
489+
def read_csv_header(
490+
path: Path | str, sep: str
491+
) -> list[str]: # pragma: no cover
492+
with Path(path).open() as f:
493+
for line in f:
494+
stripped = line.strip()
495+
if stripped:
496+
return stripped.split(sep)
497+
return []
498+
484499
parse_options = reader_options["parse_options"]
485500
sep = chr(parse_options["separator"])
486501
quote = chr(parse_options["quote_char"])
@@ -524,7 +539,9 @@ def do_evaluate(
524539
options = (
525540
plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path]))
526541
.nrows(n_rows)
527-
.skiprows(skiprows)
542+
.skiprows(
543+
skiprows if POLARS_VERSION_LT_128 else skiprows + skip_rows
544+
) # pragma: no cover
528545
.lineterminator(str(eol))
529546
.quotechar(str(quote))
530547
.decimal(decimal)
@@ -535,6 +552,13 @@ def do_evaluate(
535552
options.set_delimiter(str(sep))
536553
if column_names is not None:
537554
options.set_names([str(name) for name in column_names])
555+
else:
556+
if (
557+
not POLARS_VERSION_LT_128 and skip_rows > header
558+
): # pragma: no cover
559+
# We need to read the header otherwise we would skip it
560+
column_names = read_csv_header(path, str(sep))
561+
options.set_names(column_names)
538562
options.set_header(header)
539563
options.set_dtypes(schema)
540564
if usecols is not None:
@@ -691,6 +715,8 @@ def slice_skip(tbl: plc.Table) -> plc.Table:
691715
name=name,
692716
)
693717
df = DataFrame([index_col, *df.columns])
718+
if next(iter(schema)) != name:
719+
df = df.select(schema)
694720
assert all(c.obj.type() == schema[name] for name, c in df.column_map.items())
695721
if predicate is None:
696722
return df

0 commit comments

Comments (0)