Skip to content
Open
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1bd4269
[Data] Add map namespace support for expression operations
ryankert01 Jan 6, 2026
2e157bd
Merge branch 'master' into map-expression
ryankert01 Jan 6, 2026
68bef64
address ai review
ryankert01 Jan 6, 2026
fe2642b
fix cursor bot suggestions
ryankert01 Jan 6, 2026
843cac1
Merge branch 'master' into map-expression
ryankert01 Jan 6, 2026
f16bfd1
Merge remote-tracking branch 'origin/master' into map-expression
ryankert01 Jan 12, 2026
df1fe8c
refactor tests
ryankert01 Jan 12, 2026
6461062
Merge branch 'master' into map-expression
ryankert01 Jan 12, 2026
fcd3652
Merge branch 'master' into map-expression
ryankert01 Jan 18, 2026
202a652
Merge branch 'master' into map-expression
owenowenisme Jan 21, 2026
50a2e64
Update python/ray/data/namespace_expressions/map_namespace.py
ryankert01 Jan 22, 2026
e613cfa
address commits
ryankert01 Jan 22, 2026
70a3760
Merge branch 'master' into map-expression
ryankert01 Jan 22, 2026
49268ec
Merge branch 'master' into map-expression
ryankert01 Jan 25, 2026
c390a24
create 3 helper functions to make the intent clearer
ryankert01 Jan 25, 2026
5e024c8
use numpy.repeat()
ryankert01 Jan 25, 2026
10e4b7c
text extraction on empty ChunkedArray
ryankert01 Jan 25, 2026
f9d53b8
Merge branch 'master' into map-expression
ryankert01 Jan 25, 2026
978132e
lint
ryankert01 Jan 25, 2026
2eff519
Merge remote-tracking branch 'origin/map-expression' into map-expression
ryankert01 Jan 25, 2026
7a11478
Merge branch 'master' into map-expression
goutamvenkat-anyscale Feb 4, 2026
dae4645
Merge branch 'master' into map-expression
ryankert01 Feb 8, 2026
59f8047
address comments
ryankert01 Feb 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions python/ray/data/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
from ray.data.namespace_expressions.dt_namespace import _DatetimeNamespace
from ray.data.namespace_expressions.list_namespace import _ListNamespace
from ray.data.namespace_expressions.map_namespace import _MapNamespace
from ray.data.namespace_expressions.string_namespace import _StringNamespace
from ray.data.namespace_expressions.struct_namespace import _StructNamespace

Expand Down Expand Up @@ -656,6 +657,13 @@ def struct(self) -> "_StructNamespace":

return _StructNamespace(self)

@property
def map(self) -> "_MapNamespace":
    """Return the ``.map`` namespace bound to this expression.

    The returned namespace exposes map projections such as ``keys()`` and
    ``values()``.
    """
    # Imported at call time rather than at module load: ``map_namespace``
    # itself imports from ``ray.data.expressions``.
    from ray.data.namespace_expressions import map_namespace

    return map_namespace._MapNamespace(self)

@property
def dt(self) -> "_DatetimeNamespace":
"""Access datetime operations for this expression."""
Expand Down Expand Up @@ -1525,6 +1533,7 @@ def download(
"_ListNamespace",
"_StringNamespace",
"_StructNamespace",
"_MapNamespace",
"_DatetimeNamespace",
]

Expand All @@ -1547,6 +1556,10 @@ def __getattr__(name: str):
from ray.data.namespace_expressions.struct_namespace import _StructNamespace

return _StructNamespace
elif name == "_MapNamespace":
from ray.data.namespace_expressions.map_namespace import _MapNamespace

return _MapNamespace
elif name == "_DatetimeNamespace":
from ray.data.namespace_expressions.dt_namespace import _DatetimeNamespace

Expand Down
207 changes: 207 additions & 0 deletions python/ray/data/namespace_expressions/map_namespace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
from __future__ import annotations

from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING

import numpy as np
import pyarrow
import pyarrow.compute as pc

from ray.data.datatype import DataType
from ray.data.expressions import pyarrow_udf

if TYPE_CHECKING:
from ray.data.expressions import Expr, UDFExpr


class MapComponent(str, Enum):
    """Selects which side of a map entry a projection should extract.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # The key of every map entry.
    KEYS = "keys"
    # The value (item) of every map entry.
    VALUES = "values"


def _get_child_array(
    arr: pyarrow.Array, component: MapComponent
) -> pyarrow.Array | None:
    """Return the flat keys or values child of a map-like array.

    Two layouts are recognised: a ``pyarrow.MapArray``, and a (large) list
    array whose values are a struct with at least two fields, where field 0
    is treated as the key and field 1 as the value.

    Example: MapArray [{"a": 1}, {"b": 2}] -> keys ["a", "b"] or values [1, 2]

    Returns:
        The flat child array, or ``None`` when ``arr`` has neither layout.
    """
    want_keys = component == MapComponent.KEYS

    # MapArray is checked first and exposes both children directly.
    if isinstance(arr, pyarrow.MapArray):
        return arr.keys if want_keys else arr.items

    if not isinstance(arr, (pyarrow.ListArray, pyarrow.LargeListArray)):
        return None

    entries = arr.values
    if not isinstance(entries, pyarrow.StructArray):
        return None
    if entries.type.num_fields < 2:
        return None

    return entries.field(0 if want_keys else 1)


def _make_empty_list_array(
    arr: pyarrow.Array, component: MapComponent
) -> pyarrow.Array:
    """Build a list array of ``len(arr)`` null entries.

    Used as the result when no key/value child could be extracted from
    ``arr``. That is only accepted when ``arr`` is empty or entirely null.

    Example: arr of length 3 -> ListArray [null, null, null]

    Raises:
        TypeError: If ``arr`` contains at least one non-null element.
    """
    num_rows = len(arr)

    has_non_null = num_rows > 0 and arr.null_count < num_rows
    if has_non_null:
        raise TypeError(
            f"Expression is not a valid map type. .map.{component.value}() requires "
            f"pyarrow.MapArray or pyarrow.ListArray<Struct> with at least 2 fields "
            f"(key and value), but got: {arr.type}."
        )

    # Every row is null: all offsets are zero and the values child is empty.
    zero_offsets = np.repeat(0, num_rows + 1)
    no_values = pyarrow.array([], type=pyarrow.null())
    all_null_mask = pyarrow.array(np.repeat(True, num_rows))

    return pyarrow.ListArray.from_arrays(
        offsets=zero_offsets,
        values=no_values,
        mask=all_null_mask,
    )


def _rebuild_list_array(
    arr: pyarrow.Array, child_array: pyarrow.Array
) -> pyarrow.Array:
    """Rebuild a ListArray from the parent's offsets and a flat child array.

    The parent's offsets are always rebased to start at zero and the child is
    trimmed to the window those offsets reference, so sliced parents are
    handled uniformly. Null rows of ``arr`` stay null in the result.

    Example: offsets [5, 7, 10] -> slice child to [5:10], normalize offsets to [0, 2, 5]

    Args:
        arr: The map-like parent array; provides ``offsets`` and validity.
        child_array: The flat keys or values child belonging to ``arr``.

    Returns:
        A ``pyarrow.ListArray`` with one list per row of ``arr``.
    """
    offsets = arr.offsets
    if len(offsets) > 0:
        # ``.as_py()`` is kept on purpose: the values are used for Python
        # arithmetic and passed to ``slice``, which takes plain integers.
        first = offsets[0].as_py()
        last = offsets[-1].as_py()

        # Keep only the child values the (possibly sliced) parent references.
        child_array = child_array.slice(offset=first, length=last - first)

        # Rebase unconditionally, even when ``first`` is already 0. For a
        # sliced parent ``arr.offsets`` can be a sliced view, and
        # ``ListArray.from_arrays`` appears to reject a ``mask`` combined with
        # sliced offsets. The compute kernel returns a freshly allocated
        # offsets array. NOTE(review): confirm against the pinned pyarrow.
        offsets = pc.subtract(offsets, offsets[0])

    return pyarrow.ListArray.from_arrays(
        offsets=offsets, values=child_array, mask=arr.is_null()
    )


def _get_result_type(
    arr_type: pyarrow.DataType, component: MapComponent
) -> pyarrow.DataType:
    """Work out the list type a projection of ``arr_type`` produces.

    Map types yield a list of the key or item type. (Large) list types whose
    value type is a struct with at least two fields yield a list of field 0
    (keys) or field 1 (values). Anything else yields ``list<null>``.
    """
    want_keys = component == MapComponent.KEYS

    if pyarrow.types.is_map(arr_type):
        element_type = arr_type.key_type if want_keys else arr_type.item_type
        return pyarrow.list_(element_type)

    is_list_like = pyarrow.types.is_list(arr_type) or pyarrow.types.is_large_list(
        arr_type
    )
    if is_list_like:
        entry_type = arr_type.value_type
        if pyarrow.types.is_struct(entry_type) and entry_type.num_fields >= 2:
            field_index = 0 if want_keys else 1
            return pyarrow.list_(entry_type.field(field_index).type)

    # Not a recognised map layout.
    return pyarrow.list_(pyarrow.null())


def _extract_map_component(
    arr: pyarrow.Array, component: MapComponent
) -> pyarrow.Array:
    """Project the keys or values out of a MapArray or ListArray<Struct>.

    This serves as the primary implementation since PyArrow does not yet
    expose dedicated compute kernels for map projection in the Python API.

    A ``pyarrow.ChunkedArray`` input is processed chunk by chunk and returned
    as a ``pyarrow.ChunkedArray``. With zero chunks, the result type is derived
    from the input type instead.
    """
    if not isinstance(arr, pyarrow.ChunkedArray):
        child = _get_child_array(arr, component)
        if child is not None:
            return _rebuild_list_array(arr, child)
        # No usable child: only valid for empty or all-null input.
        return _make_empty_list_array(arr, component)

    projected = [_extract_map_component(piece, component) for piece in arr.chunks]
    if projected:
        return pyarrow.chunked_array(projected)

    # No chunks to infer a type from, so state it explicitly.
    return pyarrow.chunked_array([], type=_get_result_type(arr.type, component))


@dataclass
class _MapNamespace:
    """Namespace for map operations on expression columns.

    This namespace provides methods for operating on map-typed columns
    (including MapArrays and ListArrays of Structs) using PyArrow UDFs.

    Example:
        >>> from ray.data.expressions import col
        >>> # Get keys from map column
        >>> expr = col("headers").map.keys()
        >>> # Get values from map column
        >>> expr = col("headers").map.values()
    """

    # The expression whose map-typed values the projections operate on.
    _expr: "Expr"

    def keys(self) -> "UDFExpr":
        """Returns a list expression containing the keys of the map.

        Example:
            >>> from ray.data.expressions import col
            >>> # Get keys from map column
            >>> expr = col("headers").map.keys()

        Returns:
            A list expression containing the keys.
        """
        return self._create_projection_udf(MapComponent.KEYS)

    def values(self) -> "UDFExpr":
        """Returns a list expression containing the values of the map.

        Example:
            >>> from ray.data.expressions import col
            >>> # Get values from map column
            >>> expr = col("headers").map.values()

        Returns:
            A list expression containing the values.
        """
        return self._create_projection_udf(MapComponent.VALUES)

    def _create_projection_udf(self, component: MapComponent) -> "UDFExpr":
        """Build the UDF expression that projects ``component`` out of the map.

        The return dtype is a list of the key/value type when the wrapped
        expression has an Arrow map type, or a (large) list of structs with at
        least two fields. Otherwise it falls back to ``DataType(object)``.

        Args:
            component: Whether to project the keys or the values.

        Returns:
            The UDF expression applied to the wrapped expression.
        """
        return_dtype = DataType(object)
        if self._expr.data_type.is_arrow_type():
            arrow_type = self._expr.data_type.to_arrow_dtype()

            is_physical_map = (
                (
                    pyarrow.types.is_list(arrow_type)
                    or pyarrow.types.is_large_list(arrow_type)
                )
                and pyarrow.types.is_struct(arrow_type.value_type)
                and arrow_type.value_type.num_fields >= 2
            )

            inner_arrow_type = None
            if pyarrow.types.is_map(arrow_type):
                inner_arrow_type = (
                    arrow_type.key_type
                    if component == MapComponent.KEYS
                    else arrow_type.item_type
                )
            elif is_physical_map:
                # List<Struct> map representation: idx 0 is key, idx 1 is value.
                idx = 0 if component == MapComponent.KEYS else 1
                inner_arrow_type = arrow_type.value_type.field(idx).type

            # Compare against None explicitly instead of relying on
            # truthiness: pyarrow struct types define ``__len__``, so a valid
            # but field-less inner type would otherwise be treated as missing.
            if inner_arrow_type is not None:
                return_dtype = DataType.list(DataType.from_arrow(inner_arrow_type))

        @pyarrow_udf(return_dtype=return_dtype)
        def _project_map(arr: pyarrow.Array) -> pyarrow.Array:
            return _extract_map_component(arr, component)

        return _project_map(self._expr)
Loading