
Commit 8327ec8

[INF] Require pyspark minimum version v3.2.0 to cut duplicated code (#1116)

* Require pyspark version >= 3.2.0
* Directly import `register_dataframe_accessor`
* Update CHANGELOG.md
* Force installation of version 3.2.0 or above
1 parent: a06778a

File tree

3 files changed (+13, -75 lines)


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,8 @@
 - [ENH] Performance improvement for groupby_topk. #1093 @samukweku
 - [ENH] `min_max_scale` drop `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
 - [ENH] Add `jointly` option for `min_max_scale` support to transform each column values or entire values. Default transform each column, similar behavior to `sklearn.preprocessing.MinMaxScaler`. (Issue #1067, PR #1112, PR #1123) @Zeroto521
+- [INF] Require pyspark minimal version is v3.2.0 to cut duplicates codes. Issue #1110 @Zeroto521
+
 
 ## [v0.23.1] - 2022-05-03

environment-dev.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ dependencies:
3838
- pre-commit
3939
- pyflakes
4040
- pylint
41-
- pyspark=3.1.2
41+
- pyspark>=3.2.0
4242
- pytest
4343
- pytest-cov
4444
- pytest-xdist
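
The stricter pin matters because `pyspark.pandas` (and its `extensions` module) only exists from pyspark 3.2.0 onward; with `pyspark=3.1.2` the direct import in `janitor/spark/backend.py` below would always fail. A minimal sketch of an equivalent runtime check, not part of this commit, relying only on the public `pyspark.__version__` string:

    import pyspark

    # pyspark.pandas.extensions first shipped with pyspark 3.2.0.
    major, minor = (int(part) for part in pyspark.__version__.split(".")[:2])
    if (major, minor) < (3, 2):
        raise ImportError("janitor.spark requires pyspark>=3.2.0")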

janitor/spark/backend.py

Lines changed: 10 additions & 74 deletions
@@ -1,84 +1,20 @@
 """ Backend functions for pyspark."""
 
-import warnings
 from functools import wraps
 
-from janitor.utils import import_message
 
+try:
+    from pyspark.pandas.extensions import register_dataframe_accessor
 
-class CachedAccessor:
-    """
-    Custom property-like object (descriptor) for caching accessors.
-
-    Parameters
-    ----------
-    name : str
-        The namespace this will be accessed under, e.g. `df.foo`
-    accessor : cls
-        The class with the extension methods.
-
-    NOTE
-    ----
-    Modified based on pandas.core.accessor.
-    """
-
-    def __init__(self, name, accessor):
-        self._name = name
-        self._accessor = accessor
-
-    def __get__(self, obj, cls):
-        if obj is None:
-            # we're accessing the attribute of the class, i.e., Dataset.geo
-            return self._accessor
-        accessor_obj = self._accessor(obj)
-        # Replace the property with the accessor object. Inspired by:
-        # http://www.pydanny.com/cached-property.html
-        setattr(obj, self._name, accessor_obj)
-        return accessor_obj
-
-
-def _register_accessor(name, cls):
-    """
-    NOTE
-    ----
-    Modified based on pandas.core.accessor.
-    """
-
-    def decorator(accessor):
-        if hasattr(cls, name):
-            warnings.warn(
-                "registration of accessor {!r} under name {!r} for type "
-                "{!r} is overriding a preexisting attribute with the same "
-                "name.".format(accessor, name, cls),
-                UserWarning,
-                stacklevel=2,
-            )
-        setattr(cls, name, CachedAccessor(name, accessor))
-        return accessor
-
-    return decorator
-
-
-def register_dataframe_accessor(name):
-    """
-    NOTE
-    ----
-    Modified based on pandas.core.accessor.
-
-    .. # noqa: DAR101 name
-    .. # noqa: DAR201
-    """
-    try:
-        from pyspark.sql import DataFrame
-    except ImportError:
-        import_message(
-            submodule="spark",
-            package="pyspark",
-            conda_channel="conda-forge",
-            pip_install=True,
-        )
+except ImportError:
+    from janitor.utils import import_message
 
-    return _register_accessor(name, DataFrame)
+    import_message(
+        submodule="spark",
+        package="pyspark",
+        conda_channel="conda-forge",
+        pip_install=True,
+    )
 
 
 def register_dataframe_method(method):
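
With the upstream helper imported directly, registering an accessor on a pandas-on-Spark DataFrame is a one-decorator job; this is the machinery the deleted `CachedAccessor`/`_register_accessor` pair had copied from `pandas.core.accessor`. A minimal usage sketch, assuming pyspark >= 3.2.0; the accessor name `shout` and its class are hypothetical, not pyjanitor API:

    import pyspark.pandas as ps
    from pyspark.pandas.extensions import register_dataframe_accessor

    @register_dataframe_accessor("shout")  # hypothetical accessor name
    class ShoutAccessor:
        """Expose df.shout.* methods on pandas-on-Spark DataFrames."""

        def __init__(self, pandas_on_spark_df):
            self._df = pandas_on_spark_df

        def columns_upper(self):
            # Return a copy with upper-cased column names.
            return self._df.rename(columns=str.upper)

    df = ps.DataFrame({"a": [1, 2], "b": [3, 4]})
    print(df.shout.columns_upper().columns)  # Index(['A', 'B'], dtype='object')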
