Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/daft-core/src/lit/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,11 @@ impl<'py> IntoPyObject<'py> for Literal {
"Key and value counts should be equal in map literal"
);

Ok(PyList::new(py, keys.to_literals().zip(values.to_literals()))?.into_any())
let map = PyDict::new(py);
for (key, value) in keys.to_literals().into_iter().zip(values.to_literals()) {
map.set_item(key.into_pyobject(py)?, value.into_pyobject(py)?)?;
Comment on lines +201 to +203

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve duplicate map keys instead of overwriting

Converting map literals to a Python dict will silently drop earlier entries when a map contains duplicate keys, because later set_item calls overwrite prior ones. Arrow map semantics allow duplicate keys and preserve all pairs, so to_pydict() now loses data for any map array or literal with repeated keys (e.g., data sourced from Arrow where a key appears twice). This is a behavioral regression versus the previous list-of-tuples representation which preserved duplicates.

Useful? React with 👍 / 👎.

}
Ok(map.into_any())
}
Self::Tensor { data, shape } => {
let pyarrow = py.import(pyo3::intern!(py, "pyarrow"))?;
Expand Down
28 changes: 10 additions & 18 deletions tests/expressions/test_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ def test_list_value_counts():
value_counts = result.to_pydict()["value_counts"]

# Expected output
expected = [[("a", 2), ("b", 1), ("c", 1)], [("b", 2), ("c", 1)], [("a", 3)], [], [("d", 2)]]
expected = [{"a": 2, "b": 1, "c": 1}, {"b": 2, "c": 1}, {"a": 3}, {}, {"d": 2}]

# Check the result
assert value_counts == expected
Expand Down Expand Up @@ -689,16 +689,8 @@ def test_list_value_counts_nested():

# Apply list_value_counts operation and expect an exception
result = mp.eval_expression_list([col("nested_list_col").value_counts().alias("value_counts")])
result_dict = result.to_pydict()

assert result_dict["value_counts"] == [
[([1, 2], 1), ([3, 4], 1)],
[([1, 2], 1), ([5, 6], 1)],
[([3, 4], 1), ([1, 2], 1)],
[],
[],
[([1, 2], 2)],
]
with pytest.raises(TypeError):
result.to_pydict()


def test_list_value_counts_fixed_size():
Expand Down Expand Up @@ -727,12 +719,12 @@ def test_list_value_counts_fixed_size():
# Verify the value counts
result_dict = result.to_pydict()
assert result_dict["value_counts"] == [
[(1, 1), (2, 1), (3, 1)],
[(4, 2), (3, 1)],
[(4, 1), (5, 1), (6, 1)],
[(1, 1), (2, 1), (3, 1)],
[(7, 1), (8, 1), (9, 1)],
[],
{1: 1, 2: 1, 3: 1},
{4: 2, 3: 1},
{4: 1, 5: 1, 6: 1},
{1: 1, 2: 1, 3: 1},
{7: 1, 8: 1, 9: 1},
{},
]


Expand All @@ -754,7 +746,7 @@ def test_list_value_counts_degenerate():
result_null = null_mp.eval_expression_list([col("null_list_col").value_counts().alias("value_counts")])

# Check the result for null values
assert result_null.to_pydict() == {"value_counts": [[], []]}
assert result_null.to_pydict() == {"value_counts": [{}, {}]}


@pytest.mark.parametrize(
Expand Down
4 changes: 2 additions & 2 deletions tests/recordbatch/test_from_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def test_from_pydict_arrow_map_array() -> None:
# Perform expected Daft cast, where the inner string and int arrays are cast to large string and int arrays.
expected = arrow_arr.cast(pa.map_(pa.int64(), pa.float64()))
assert daft_recordbatch.to_arrow()["a"].combine_chunks() == expected
assert daft_recordbatch.to_pydict()["a"] == data
assert daft_recordbatch.to_pydict()["a"] == [{1: 2.0, 3: 4.0}, None, {5: 6.0, 7: 8.0}]


def test_from_pydict_arrow_struct_array() -> None:
Expand Down Expand Up @@ -524,7 +524,7 @@ def test_from_arrow_map_array() -> None:
# Perform expected Daft cast, where the inner string and int arrays are cast to large string and int arrays.
expected = arrow_arr.cast(pa.map_(pa.float32(), pa.int32()))
assert daft_recordbatch.to_arrow()["a"].combine_chunks() == expected
assert daft_recordbatch.to_pydict()["a"] == data
assert daft_recordbatch.to_pydict()["a"] == [{1.0: 1, 2.0: 2}, {3.0: 3, 4.0: 4}]


@pytest.mark.skipif(
Expand Down
4 changes: 2 additions & 2 deletions tests/series/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ def test_series_concat_map_array(chunks) -> None:
counter = 0
for i in range(chunks):
for j in range(i):
assert concated_list[counter][0][1] == i + j
assert concated_list[counter][1][1] == float(i * j)
assert concated_list[counter]["a"] == i + j
assert concated_list[counter]["b"] == float(i * j)
counter += 1


Expand Down
10 changes: 5 additions & 5 deletions tests/series/test_if_else.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ def test_series_if_else_fixed_size_list(if_true, if_false, expected) -> None:
[[("a", 8), ("b", 9)], [("c", 10)], None, [("a", 12), ("b", 13)]],
type=pa.map_(pa.string(), pa.int64()),
),
[[("a", 1), ("b", 2)], [("c", 10)], None, [("a", 5), ("c", 7)]],
[{"a": 1, "b": 2}, {"c": 10}, None, {"a": 5, "c": 7}],
),
# Same length, different super-castable data type
(
Expand All @@ -261,7 +261,7 @@ def test_series_if_else_fixed_size_list(if_true, if_false, expected) -> None:
[[("a", 8), ("b", 9)], [("c", 10)], None, [("a", 12), ("b", 13)]],
type=pa.map_(pa.string(), pa.int64()),
),
[[("a", 1), ("b", 2)], [("c", 10)], None, [("a", 5), ("c", 7)]],
[{"a": 1, "b": 2}, {"c": 10}, None, {"a": 5, "c": 7}],
),
# Broadcast left
(
Expand All @@ -270,7 +270,7 @@ def test_series_if_else_fixed_size_list(if_true, if_false, expected) -> None:
[[("a", 8), ("b", 9)], [("c", 10)], None, [("a", 12), ("b", 13)]],
type=pa.map_(pa.string(), pa.int64()),
),
[[("a", 1), ("b", 2)], [("c", 10)], None, [("a", 1), ("b", 2)]],
[{"a": 1, "b": 2}, {"c": 10}, None, {"a": 1, "b": 2}],
),
# Broadcast right
(
Expand All @@ -279,13 +279,13 @@ def test_series_if_else_fixed_size_list(if_true, if_false, expected) -> None:
type=pa.map_(pa.string(), pa.int64()),
),
pa.array([[("a", 8), ("b", 9)]], type=pa.map_(pa.string(), pa.int64())),
[[("a", 1), ("b", 2)], [("a", 8), ("b", 9)], None, [("a", 5), ("c", 7)]],
[{"a": 1, "b": 2}, {"a": 8, "b": 9}, None, {"a": 5, "c": 7}],
),
# Broadcast both
(
pa.array([[("a", 1), ("b", 2)]], type=pa.map_(pa.string(), pa.int64())),
pa.array([[("a", 8), ("b", 9)]], type=pa.map_(pa.string(), pa.int64())),
[[("a", 1), ("b", 2)], [("a", 8), ("b", 9)], None, [("a", 1), ("b", 2)]],
[{"a": 1, "b": 2}, {"a": 8, "b": 9}, None, {"a": 1, "b": 2}],
),
],
)
Expand Down
Loading