From aa86386588680e62ba86d7fa51ec1471054da406 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Sep 2025 08:07:26 -0700 Subject: [PATCH 1/2] REF: simplify _append_internal --- pandas/core/frame.py | 72 +++++++------------ pandas/core/reshape/pivot.py | 2 +- pandas/core/series.py | 16 +---- pandas/tests/reshape/concat/test_append.py | 71 ++++++------------ .../reshape/concat/test_append_common.py | 41 ----------- .../tests/reshape/concat/test_categorical.py | 3 - pandas/tests/reshape/concat/test_index.py | 15 ---- 7 files changed, 53 insertions(+), 167 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e649f4bd60e31..ec9ed4261f60f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10893,61 +10893,43 @@ def infer(x): def _append_internal( self, - other, + other: Series, ignore_index: bool = False, - verify_integrity: bool = False, - sort: bool = False, ) -> DataFrame: - if isinstance(other, (Series, dict)): - if isinstance(other, dict): - if not ignore_index: - raise TypeError("Can only append a dict if ignore_index=True") - other = Series(other) - if other.name is None and not ignore_index: - raise TypeError( - "Can only append a Series if ignore_index=True " - "or if the Series has a name" - ) + assert isinstance(other, Series), type(other) - index = Index( - [other.name], - name=( - self.index.names - if isinstance(self.index, MultiIndex) - else self.index.name - ), + if other.name is None and not ignore_index: + raise TypeError( + "Can only append a Series if ignore_index=True " + "or if the Series has a name" ) - row_df = other.to_frame().T - if isinstance(self.index.dtype, ExtensionDtype): - # GH#41626 retain e.g. CategoricalDtype if reached via - # df.loc[key] = item - row_df.index = self.index.array._cast_pointwise_result( - row_df.index._values - ) - # infer_objects is needed for - # test_append_empty_frame_to_series_with_dateutil_tz - other = row_df.infer_objects().rename_axis(index.names) - elif isinstance(other, list): - if not other: - pass - elif not isinstance(other[0], DataFrame): - other = DataFrame(other) - if self.index.name is not None and not ignore_index: - other.index.name = self.index.name + index = Index( + [other.name], + name=( + self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name + ), + ) - from pandas.core.reshape.concat import concat + row_df = other.to_frame().T + if isinstance(self.index.dtype, ExtensionDtype): + # GH#41626 retain e.g. CategoricalDtype if reached via + # df.loc[key] = item + row_df.index = self.index.array._cast_pointwise_result( + row_df.index._values + ) - if isinstance(other, (list, tuple)): - to_concat = [self, *other] - else: - to_concat = [self, other] + # infer_objects is needed for + # test_append_empty_frame_to_series_with_dateutil_tz + other = row_df.infer_objects().rename_axis(index.names) + + from pandas.core.reshape.concat import concat result = concat( - to_concat, + [self, other], ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort, ) return result.__finalize__(self, method="append") diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 8b7ca9f437268..5320e6b95e406 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -504,7 +504,7 @@ def _add_margins( margin_dummy[cols] = margin_dummy[cols].apply( maybe_downcast_to_dtype, args=(dtype,) ) - result = result._append_internal(margin_dummy) + result = concat([result, margin_dummy]) result.index.names = row_names return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 56ef313d1a73a..780a60f6b9b0c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2989,22 +2989,10 @@ def searchsorted( # type: ignore[override] # ------------------------------------------------------------------- # Combination - def _append_internal( - self, to_append, ignore_index: bool = False, verify_integrity: bool = False - ): + def _append_internal(self, to_append: Series, ignore_index: bool = False) -> Series: from pandas.core.reshape.concat import concat - if isinstance(to_append, (list, tuple)): - to_concat = [self] - to_concat.extend(to_append) - else: - to_concat = [self, to_append] - if any(isinstance(x, (ABCDataFrame,)) for x in to_concat[1:]): - msg = "to_append should be a Series or list/tuple of Series, got DataFrame" - raise TypeError(msg) - return concat( - to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity - ) + return concat([self, to_append], ignore_index=ignore_index) @doc( _shared_docs["compare"], diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index b237195d0350c..258d0e89fda7a 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -28,23 +28,23 @@ def test_append(self, sort, float_frame): begin_frame = float_frame.reindex(begin_index) end_frame = float_frame.reindex(end_index) - appended = begin_frame._append_internal(end_frame) + appended = concat([begin_frame, end_frame]) tm.assert_almost_equal(appended["A"], float_frame["A"]) del end_frame["A"] - partial_appended = begin_frame._append_internal(end_frame, sort=sort) + partial_appended = concat([begin_frame, end_frame], sort=sort) assert "A" in partial_appended - partial_appended = end_frame._append_internal(begin_frame, sort=sort) + partial_appended = concat([end_frame, begin_frame], sort=sort) assert "A" in partial_appended # mixed type handling - appended = mixed_frame[:5]._append_internal(mixed_frame[5:]) + appended = concat([mixed_frame[:5], mixed_frame[5:]]) tm.assert_frame_equal(appended, mixed_frame) # what to test here - mixed_appended = mixed_frame[:5]._append_internal(float_frame[5:], sort=sort) - mixed_appended2 = float_frame[:5]._append_internal(mixed_frame[5:], sort=sort) + mixed_appended = concat([mixed_frame[:5], float_frame[5:]], sort=sort) + mixed_appended2 = concat([float_frame[:5], mixed_frame[5:]], sort=sort) # all equal except 'foo' column tm.assert_frame_equal( @@ -55,18 +55,18 @@ def test_append(self, sort, float_frame): def test_append_empty(self, float_frame): empty = DataFrame() - appended = float_frame._append_internal(empty) + appended = concat([float_frame, empty]) tm.assert_frame_equal(float_frame, appended) assert appended is not float_frame - appended = empty._append_internal(float_frame) + appended = concat([empty, float_frame]) tm.assert_frame_equal(float_frame, appended) assert appended is not float_frame def test_append_overlap_raises(self, float_frame): msg = "Indexes have overlapping values" with pytest.raises(ValueError, match=msg): - float_frame._append_internal(float_frame, verify_integrity=True) + concat([float_frame, float_frame], verify_integrity=True) def test_append_new_columns(self): # see gh-6129: new columns @@ -85,7 +85,7 @@ def test_append_new_columns(self): def test_append_length0_frame(self, sort): df = DataFrame(columns=["A", "B", "C"]) df3 = DataFrame(index=[0, 1], columns=["A", "B"]) - df5 = df._append_internal(df3, sort=sort) + df5 = concat([df, df3], sort=sort) expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) tm.assert_frame_equal(df5, expected) @@ -100,7 +100,7 @@ def test_append_records(self): df1 = DataFrame(arr1) df2 = DataFrame(arr2) - result = df1._append_internal(df2, ignore_index=True) + result = concat([df1, df2], ignore_index=True) expected = DataFrame(np.concatenate((arr1, arr2))) tm.assert_frame_equal(result, expected) @@ -109,7 +109,7 @@ def test_append_sorts(self, sort): df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) - result = df1._append_internal(df2, sort=sort) + result = concat([df1, df2], sort=sort) # for None / True expected = DataFrame( @@ -133,28 +133,10 @@ def test_append_different_columns(self, sort): a = df[:5].loc[:, ["bools", "ints", "floats"]] b = df[5:].loc[:, ["strings", "ints", "floats"]] - appended = a._append_internal(b, sort=sort) + appended = concat([a, b], sort=sort) assert isna(appended["strings"][0:4]).all() assert isna(appended["bools"][5:]).all() - def test_append_many(self, sort, float_frame): - chunks = [ - float_frame[:5], - float_frame[5:10], - float_frame[10:15], - float_frame[15:], - ] - - result = chunks[0]._append_internal(chunks[1:]) - tm.assert_frame_equal(result, float_frame) - - chunks[-1] = chunks[-1].copy() - chunks[-1]["foo"] = "bar" - result = chunks[0]._append_internal(chunks[1:], sort=sort) - tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) - assert (result["foo"][15:] == "bar").all() - assert result["foo"][:15].isna().all() - def test_append_preserve_index_name(self): # #980 df1 = DataFrame(columns=["A", "B", "C"]) @@ -162,7 +144,7 @@ def test_append_preserve_index_name(self): df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) df2 = df2.set_index(["A"]) - result = df1._append_internal(df2) + result = concat([df1, df2]) assert result.index.name == "A" indexes_can_append = [ @@ -285,7 +267,7 @@ def test_append_dtype_coerce(self, sort): axis=1, sort=sort, ) - result = df1._append_internal(df2, ignore_index=True, sort=sort) + result = concat([df1, df2], ignore_index=True, sort=sort) if sort: expected = expected[["end_time", "start_time"]] else: @@ -297,7 +279,7 @@ def test_append_missing_column_proper_upcast(self, sort): df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) - appended = df1._append_internal(df2, ignore_index=True, sort=sort) + appended = concat([df1, df2], sort=sort) assert appended["A"].dtype == "f8" assert appended["B"].dtype == "O" @@ -323,27 +305,20 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result_b = result_a._append_internal(ser, ignore_index=True) tm.assert_frame_equal(result_b, expected) - result = df._append_internal([ser, ser], ignore_index=True) - tm.assert_frame_equal(result, expected) - def test_append_empty_tz_frame_with_datetime64ns(self): # https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - # pd.NaT gets inferred as tz-naive, so append result is tz-naive - result = df._append_internal({"a": pd.NaT}, ignore_index=True) - expected = DataFrame({"a": [pd.NaT]}, dtype=object) - tm.assert_frame_equal(result, expected) - # also test with typed value to append df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - other = Series({"a": pd.NaT}, dtype="datetime64[ns]") - result = df._append_internal(other, ignore_index=True) + other = Series({"a": pd.NaT}, dtype="datetime64[ns]").to_frame().T + result = concat([df, other], ignore_index=True) + expected = DataFrame({"a": [pd.NaT]}, dtype=object) tm.assert_frame_equal(result, expected) # mismatched tz - other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]") - result = df._append_internal(other, ignore_index=True) + other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]").to_frame().T + result = concat([df, other], ignore_index=True) expected = DataFrame({"a": [pd.NaT]}).astype(object) tm.assert_frame_equal(result, expected) @@ -356,7 +331,7 @@ def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val): df = DataFrame(columns=["a"]).astype(dtype_str) other = DataFrame({"a": [np.timedelta64(val, "ns")]}) - result = df._append_internal(other, ignore_index=True) + result = concat([df, other]) expected = other.astype(object) tm.assert_frame_equal(result, expected) @@ -370,7 +345,7 @@ def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val): df = DataFrame({"a": pd.array([1], dtype=dtype_str)}) other = DataFrame({"a": [np.timedelta64(val, "ns")]}) - result = df._append_internal(other, ignore_index=True) + result = concat([df, other], ignore_index=True) expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 06b283062f4c1..f49ff8310f855 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -132,12 +132,7 @@ def test_concatlike_same_dtypes(self, item): tm.assert_series_equal(res, exp, check_index_type=True) # 3 elements - res = Series(vals1)._append_internal( - [Series(vals2), Series(vals3)], ignore_index=True - ) exp = Series(exp_data3) - tm.assert_series_equal(res, exp) - res = pd.concat( [Series(vals1), Series(vals2), Series(vals3)], ignore_index=True, @@ -169,12 +164,6 @@ def test_concatlike_same_dtypes(self, item): r"cannot concatenate object of type '.+'; " "only Series and DataFrame objs are valid" ) - with pytest.raises(TypeError, match=msg): - Series(vals1)._append_internal(vals2) - - with pytest.raises(TypeError, match=msg): - Series(vals1)._append_internal([Series(vals2), vals3]) - with pytest.raises(TypeError, match=msg): pd.concat([Series(vals1), vals2]) @@ -246,13 +235,7 @@ def test_concatlike_dtypes_coercion(self, item, item2, request): # 3 elements # GH#39817 - res = Series(vals1)._append_internal( - [Series(vals2), Series(vals3)], ignore_index=True - ) exp = Series(exp_data3, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp) - - # GH#39817 res = pd.concat( [Series(vals1), Series(vals2), Series(vals3)], ignore_index=True, @@ -326,7 +309,6 @@ def test_concatlike_datetimetz_short(self, tz): ).as_unit("ns") exp = DataFrame(0, index=exp_idx, columns=["A", "B"]) - tm.assert_frame_equal(df1._append_internal(df2), exp) tm.assert_frame_equal(pd.concat([df1, df2]), exp) def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): @@ -586,11 +568,9 @@ def test_concat_categorical_3elem_coercion(self): exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal([s2, s3], ignore_index=True), exp) exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3._append_internal([s1, s2], ignore_index=True), exp) # values are all in either category => not-category s1 = Series([4, 5, 6], dtype="category") @@ -599,11 +579,9 @@ def test_concat_categorical_3elem_coercion(self): exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal([s2, s3], ignore_index=True), exp) exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3._append_internal([s1, s2], ignore_index=True), exp) # values are all in either category => not-category s1 = Series([4, 5, 6], dtype="category") @@ -612,11 +590,9 @@ def test_concat_categorical_3elem_coercion(self): exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal([s2, s3], ignore_index=True), exp) exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3._append_internal([s1, s2], ignore_index=True), exp) def test_concat_categorical_multi_coercion(self): # GH 13524 @@ -632,14 +608,10 @@ def test_concat_categorical_multi_coercion(self): exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) tm.assert_series_equal(res, exp) - res = s1._append_internal([s2, s3, s4, s5, s6], ignore_index=True) - tm.assert_series_equal(res, exp) exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) tm.assert_series_equal(res, exp) - res = s6._append_internal([s5, s4, s3, s2, s1], ignore_index=True) - tm.assert_series_equal(res, exp) def test_concat_categorical_ordered(self): # GH 13524 @@ -649,11 +621,9 @@ def test_concat_categorical_ordered(self): exp = Series(Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), exp) exp = Series(Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True)) tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal([s2, s1], ignore_index=True), exp) def test_concat_categorical_coercion_nan(self): # GH 13524 @@ -665,14 +635,12 @@ def test_concat_categorical_coercion_nan(self): exp = Series([np.nan, np.nan, np.nan, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), exp) s1 = Series([1, np.nan], dtype="category") s2 = Series([np.nan, np.nan]) exp = Series([1, np.nan, np.nan, np.nan], dtype="float") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), exp) # mixed dtype, all nan-likes => not-category s1 = Series([np.nan, np.nan], dtype="category") @@ -680,9 +648,7 @@ def test_concat_categorical_coercion_nan(self): exp = Series([np.nan, np.nan, np.nan, np.nan]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), exp) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2._append_internal(s1, ignore_index=True), exp) # all category nan-likes => category s1 = Series([np.nan, np.nan], dtype="category") @@ -691,7 +657,6 @@ def test_concat_categorical_coercion_nan(self): exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), exp) def test_concat_categorical_empty(self): # GH 13524 @@ -725,10 +690,8 @@ def test_concat_categorical_empty(self): exp = Series([np.nan, np.nan], dtype=object) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), exp) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2._append_internal(s1, ignore_index=True), exp) def test_categorical_concat_append(self): cat = Categorical(["a", "b"], categories=["a", "b"]) @@ -739,7 +702,6 @@ def test_categorical_concat_append(self): exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) tm.assert_frame_equal(pd.concat([df, df]), exp) - tm.assert_frame_equal(df._append_internal(df), exp) # GH 13524 can concat different categories cat3 = Categorical(["a", "b"], categories=["a", "b", "c"]) @@ -749,6 +711,3 @@ def test_categorical_concat_append(self): res = pd.concat([df, df_different_categories], ignore_index=True) exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) tm.assert_frame_equal(res, exp) - - res = df._append_internal(df_different_categories, ignore_index=True) - tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 3445ac5a68fb6..8cd02b4989202 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -219,9 +219,6 @@ def test_categorical_concat_gh7864(self): dfx = pd.concat([df1, df2]) tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) - dfa = df1._append_internal(df2) - tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) - def test_categorical_index_upcast(self): # GH 17629 # test upcasting to object when concatenating on categorical indexes diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 6de1b9bd65a3c..ace0776ae6143 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -190,21 +190,6 @@ def test_dups_index(self): tm.assert_frame_equal(result.iloc[:10], df) tm.assert_frame_equal(result.iloc[10:], df) - # append - result = df.iloc[0:8, :]._append_internal(df.iloc[8:]) - tm.assert_frame_equal(result, df) - - result = ( - df.iloc[0:8, :] - ._append_internal(df.iloc[8:9]) - ._append_internal(df.iloc[9:10]) - ) - tm.assert_frame_equal(result, df) - - expected = concat([df, df], axis=0) - result = df._append_internal(df) - tm.assert_frame_equal(result, expected) - class TestMultiIndexConcat: def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data): From db8b16635c3a9f5edbe01abe7ae0d34bd6dd406f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Sep 2025 13:33:00 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/frame.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec9ed4261f60f..cc03900b71442 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10917,18 +10917,16 @@ def _append_internal( if isinstance(self.index.dtype, ExtensionDtype): # GH#41626 retain e.g. CategoricalDtype if reached via # df.loc[key] = item - row_df.index = self.index.array._cast_pointwise_result( - row_df.index._values - ) + row_df.index = self.index.array._cast_pointwise_result(row_df.index._values) # infer_objects is needed for # test_append_empty_frame_to_series_with_dateutil_tz - other = row_df.infer_objects().rename_axis(index.names) + row_df = row_df.infer_objects().rename_axis(index.names) from pandas.core.reshape.concat import concat result = concat( - [self, other], + [self, row_df], ignore_index=ignore_index, ) return result.__finalize__(self, method="append")