def equal_levels(self, other: "MultiIndex") -> bool:
    """
    Return True if the levels of both MultiIndex objects are the same

    The comparison is made level by level on the set of distinct values, so
    neither the order nor the multiplicity of the entries matters. The check is
    symmetric: ``a.equal_levels(b)`` always equals ``b.equal_levels(a)``.

    .. versionadded:: 3.3.0

    Parameters
    ----------
    other : MultiIndex
        The MultiIndex whose levels are compared with the levels of this one.

    Returns
    -------
    bool
        True if both objects have the same number of levels and every level
        holds the same set of values in both objects, False otherwise.

    Examples
    --------
    >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
    >>> psmidx1.equal_levels(psmidx2)
    True

    >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
    >>> psmidx1.equal_levels(psmidx2)
    False

    >>> psmidx3 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
    >>> psmidx3.equal_levels(psmidx1)
    False
    """
    nlevels = self.nlevels
    if nlevels != other.nlevels:
        return False

    self_sdf = self._internal.spark_frame
    other_sdf = other._internal.spark_frame
    subtract_list = []
    for nlevel in range(nlevels):
        self_level_sdf = self_sdf.select(self._internal.index_spark_columns[nlevel])
        other_level_sdf = other_sdf.select(other._internal.index_spark_columns[nlevel])
        # Both directions are required: `self - other` alone is empty whenever
        # the level of `self` is merely a subset of the level of `other`.
        subtract_list.append(self_level_sdf.subtract(other_level_sdf))
        subtract_list.append(other_level_sdf.subtract(self_level_sdf))

    # A single union lets one `head(1)` call decide whether any level differs.
    unioned_subtracts = reduce(lambda x, y: x.union(y), subtract_list)
    return len(unioned_subtracts.head(1)) == 0
def test_multiindex_equal_levels(self):
    """Check that ``MultiIndex.equal_levels`` agrees with pandas for several index pairs."""
    base_tuples = [("a", "x"), ("b", "y"), ("c", "z")]
    # Each entry is (tuples for the left index, tuples for the right index);
    # the pairs are compared in the order listed.
    tuple_pairs = [
        (base_tuples, [("b", "y"), ("a", "x"), ("c", "z")]),
        (base_tuples, [("a", "x"), ("b", "y"), ("c", "j")]),
        (base_tuples, [("a", "x"), ("b", "y"), ("a", "x")]),
        (base_tuples, [("a", "x"), ("b", "y")]),
        (base_tuples, [("a", "y"), ("b", "x"), ("c", "z")]),
        (
            [("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")],
            [("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")],
        ),
        (base_tuples, [("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")]),
    ]

    for left_tuples, right_tuples in tuple_pairs:
        pmidx1 = pd.MultiIndex.from_tuples(left_tuples)
        pmidx2 = pd.MultiIndex.from_tuples(right_tuples)
        psmidx1 = ps.from_pandas(pmidx1)
        psmidx2 = ps.from_pandas(pmidx2)
        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))