-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
BUG: DataFrame arithmetic operators don't work with Series using fill_value #61828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
99ae672
4e77fb7
eb12b34
7e23b65
4617108
bca56fe
7273396
4493e08
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -890,7 +890,14 @@ def _op_method_error_message(self, other, op) -> str: | |
def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: | ||
pa_type = self._pa_array.type | ||
other_original = other | ||
other = self._box_pa(other) | ||
try: | ||
other = self._box_pa(other) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this change necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I found errors when handling Categorical (a straight conversion error) and Offsets ("No temporal attributes found on object"). This is essentially a wrapper to make it clearer to the user where it went wrong. |
||
except (ValueError, pa.lib.ArrowTypeError) as err: | ||
# Categorical and Interval dtype raises errors in self._box_pa | ||
# Could be fixed in the future if needed | ||
raise TypeError( | ||
"Incompatible type when converting to PyArrow dtype for operation." | ||
) from err | ||
|
||
if ( | ||
pa.types.is_string(pa_type) | ||
|
@@ -899,6 +906,13 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: | |
): | ||
if op in [operator.add, roperator.radd]: | ||
sep = pa.scalar("", type=pa_type) | ||
if ( | ||
pa.types.is_string(other.type) | ||
or pa.types.is_large_string(other.type) | ||
or pa.types.is_binary(other.type) | ||
or isna(other).all() | ||
): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We want to limit it to only string + string/null. Allowing other data types (e.g. Timedelta) causes weird conversion issues because they are cast directly to string. The last line is there to prevent unwanted conversions from arrays with any null type. |
||
other = other.cast(pa_type) | ||
try: | ||
if op is operator.add: | ||
result = pc.binary_join_element_wise(self._pa_array, other, sep) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -626,11 +626,43 @@ def test_arith_flex_frame_corner(self, float_frame): | |
expected = float_frame.sort_index() * np.nan | ||
tm.assert_frame_equal(result, expected) | ||
|
||
with pytest.raises(NotImplementedError, match="fill_value"): | ||
float_frame.add(float_frame.iloc[0], fill_value=3) | ||
@pytest.mark.parametrize("axis", [0, 1]) | ||
def test_arith_flex_frame_fill_value_corner(self, float_frame, axis): | ||
rng = np.random.default_rng(60) | ||
mask = rng.random(float_frame.shape) < 0.2 | ||
left = float_frame.mask(mask) | ||
right = left.iloc[0] | ||
|
||
result = left.add(right, axis=axis, fill_value=3) | ||
|
||
if axis == 0: # axis = index, vertical | ||
pad_num = abs(result.shape[0] - len(right)) | ||
mult_num = result.shape[1] | ||
right_pad = np.pad( | ||
right, (0, pad_num), mode="constant", constant_values=(np.nan) | ||
) | ||
right_df = DataFrame( | ||
[right_pad] * mult_num, columns=result.index, index=result.columns | ||
).T | ||
|
||
left = left.reindex_like(result) | ||
|
||
else: # axis = columns, horizontal | ||
pad_num = abs(result.shape[1] - len(right)) | ||
mult_num = result.shape[0] | ||
right_pad = np.pad( | ||
right, (0, pad_num), mode="constant", constant_values=(np.nan) | ||
) | ||
right_df = DataFrame( | ||
[right_pad] * mult_num, index=result.index, columns=result.columns | ||
) | ||
|
||
with pytest.raises(NotImplementedError, match="fill_value"): | ||
float_frame.add(float_frame.iloc[0], axis="index", fill_value=3) | ||
left_filled = left.fillna(3) | ||
right_filled = right_df.fillna(3) | ||
expected = right_filled + left_filled | ||
expected = expected.mask(expected == 6, pd.NA) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"]) | ||
def test_arith_flex_series_ops(self, simple_frame, op): | ||
|
@@ -672,11 +704,21 @@ def test_arith_flex_zero_len_raises(self): | |
df_len0 = DataFrame(columns=["A", "B"]) | ||
df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) | ||
|
||
with pytest.raises(NotImplementedError, match="fill_value"): | ||
msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'" | ||
with pytest.raises(TypeError, match=msg): | ||
df.add(ser_len0, fill_value="E") | ||
|
||
with pytest.raises(NotImplementedError, match="fill_value"): | ||
df_len0.sub(df["A"], axis=None, fill_value=3) | ||
result = df_len0.sub(df, axis=None, fill_value=3) | ||
expected = DataFrame([[2, 1], [0, -1]], columns=["A", "B"]) | ||
tm.assert_frame_equal(result, expected, check_dtype=False) | ||
|
||
result = df_len0.sub(df["A"], axis=0, fill_value=3) | ||
expected = DataFrame([[2, 2], [0, 0]], columns=["A", "B"]) | ||
tm.assert_frame_equal(result, expected, check_dtype=False) | ||
|
||
result = df_len0.sub(df["A"], axis=1, fill_value=3) | ||
expected = DataFrame([], columns=["A", "B", 0, 1]) | ||
tm.assert_frame_equal(result, expected, check_dtype=False) | ||
|
||
def test_flex_add_scalar_fill_value(self): | ||
# GH#12723 | ||
|
@@ -2192,3 +2234,61 @@ def test_mixed_col_index_dtype(string_dtype_no_object): | |
expected.columns = expected.columns.astype(string_dtype_no_object) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
dt_params = [ | ||
(tm.ALL_INT_NUMPY_DTYPES[0], 5), | ||
(tm.ALL_INT_EA_DTYPES[0], 5), | ||
(tm.FLOAT_NUMPY_DTYPES[0], 4.9), | ||
(tm.FLOAT_EA_DTYPES[0], 4.9), | ||
] | ||
|
||
axes = [0, 1] | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"data_type,fill_val, axis", | ||
[(dt, val, axis) for axis in axes for dt, val in dt_params], | ||
) | ||
def test_df_fill_value_dtype(data_type, fill_val, axis): | ||
# GH 61581 | ||
base_data = np.arange(25).reshape(5, 5) | ||
mult_list = [1, np.nan, 5, np.nan, 3] | ||
np_int_flag = 0 | ||
|
||
try: | ||
mult_data = pd.array(mult_list, dtype=data_type) | ||
except ValueError as e: | ||
# Numpy int type cannot represent NaN, it will end up here | ||
if "cannot convert float NaN to integer" in str(e): | ||
mult_data = np.asarray(mult_list) | ||
np_int_flag = 1 | ||
|
||
columns = list("ABCDE") | ||
df = DataFrame(base_data, columns=columns) | ||
|
||
for i in range(df.shape[0]): | ||
try: | ||
df.iat[i, i] = np.nan | ||
df.iat[i + 1, i] = pd.NA | ||
df.iat[i + 3, i] = pd.NA | ||
|
||
except IndexError: | ||
pass | ||
|
||
mult_mat = np.broadcast_to(mult_data, df.shape) | ||
if axis == 0: | ||
mask = np.isnan(mult_mat).T | ||
else: | ||
mask = np.isnan(mult_mat) | ||
mask = df.isna().values & mask | ||
|
||
df_result = df.mul(mult_data, axis=axis, fill_value=fill_val) | ||
if np_int_flag == 1: | ||
mult_np = np.nan_to_num(mult_data, nan=fill_val) | ||
df_expected = (df.fillna(fill_val).mul(mult_np, axis=axis)).mask(mask, np.nan) | ||
else: | ||
df_expected = ( | ||
df.fillna(fill_val).mul(mult_data.fillna(fill_val), axis=axis) | ||
).mask(mask, np.nan) | ||
|
||
tm.assert_frame_equal(df_result, df_expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
did it not before?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sort of. The current implementation returns object types within the DataFrame when doing addition; it now returns string when adding to strings.
Additionally, it would error on any null values; that is now fixed, and it returns a string null.
I can edit the doc to make that clearer.