
Commit 8327ec8

[INF] Require pyspark minimum version v3.2.0 to cut duplicated code (#1116)

* Require pyspark version >= 3.2.0
* Directly import `register_dataframe_accessor`
* Update CHANGELOG.md
* Force installation of version 3.2.0 or above
1 parent: a06778a

File tree

3 files changed (+13, -75 lines)


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,8 @@
 - [ENH] Performance improvement for groupby_topk. #1093 @samukweku
 - [ENH] `min_max_scale` drop `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
 - [ENH] Add `jointly` option for `min_max_scale` support to transform each column values or entire values. Default transform each column, similar behavior to `sklearn.preprocessing.MinMaxScaler`. (Issue #1067, PR #1112, PR #1123) @Zeroto521
+- [INF] Require pyspark minimal version is v3.2.0 to cut duplicates codes. Issue #1110 @Zeroto521
+
 
 ## [v0.23.1] - 2022-05-03

environment-dev.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ dependencies:
3838
- pre-commit
3939
- pyflakes
4040
- pylint
41-
- pyspark=3.1.2
41+
- pyspark>=3.2.0
4242
- pytest
4343
- pytest-cov
4444
- pytest-xdist
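
The stricter pin matters because `pyspark.pandas` (and its `extensions` module) only exists from pyspark 3.2.0 onward; with `pyspark=3.1.2` the direct import in `janitor/spark/backend.py` below would always fail. A minimal sketch of an equivalent runtime check, not part of this commit, relying only on the public `pyspark.__version__` string:

    import pyspark

    # pyspark.pandas.extensions first shipped with pyspark 3.2.0.
    major, minor = (int(part) for part in pyspark.__version__.split(".")[:2])
    if (major, minor) < (3, 2):
        raise ImportError("janitor.spark requires pyspark>=3.2.0")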

janitor/spark/backend.py

Lines changed: 10 additions & 74 deletions
@@ -1,84 +1,20 @@
 """ Backend functions for pyspark."""
 
-import warnings
 from functools import wraps
 
-from janitor.utils import import_message
 
+try:
+    from pyspark.pandas.extensions import register_dataframe_accessor
 
-class CachedAccessor:
-    """
-    Custom property-like object (descriptor) for caching accessors.
-
-    Parameters
-    ----------
-    name : str
-        The namespace this will be accessed under, e.g. `df.foo`
-    accessor : cls
-        The class with the extension methods.
-
-    NOTE
-    ----
-    Modified based on pandas.core.accessor.
-    """
-
-    def __init__(self, name, accessor):
-        self._name = name
-        self._accessor = accessor
-
-    def __get__(self, obj, cls):
-        if obj is None:
-            # we're accessing the attribute of the class, i.e., Dataset.geo
-            return self._accessor
-        accessor_obj = self._accessor(obj)
-        # Replace the property with the accessor object. Inspired by:
-        # http://www.pydanny.com/cached-property.html
-        setattr(obj, self._name, accessor_obj)
-        return accessor_obj
-
-
-def _register_accessor(name, cls):
-    """
-    NOTE
-    ----
-    Modified based on pandas.core.accessor.
-    """
-
-    def decorator(accessor):
-        if hasattr(cls, name):
-            warnings.warn(
-                "registration of accessor {!r} under name {!r} for type "
-                "{!r} is overriding a preexisting attribute with the same "
-                "name.".format(accessor, name, cls),
-                UserWarning,
-                stacklevel=2,
-            )
-        setattr(cls, name, CachedAccessor(name, accessor))
-        return accessor
-
-    return decorator
-
-
-def register_dataframe_accessor(name):
-    """
-    NOTE
-    ----
-    Modified based on pandas.core.accessor.
-
-    .. # noqa: DAR101 name
-    .. # noqa: DAR201
-    """
-    try:
-        from pyspark.sql import DataFrame
-    except ImportError:
-        import_message(
-            submodule="spark",
-            package="pyspark",
-            conda_channel="conda-forge",
-            pip_install=True,
-        )
+except ImportError:
+    from janitor.utils import import_message
 
-    return _register_accessor(name, DataFrame)
+    import_message(
+        submodule="spark",
+        package="pyspark",
+        conda_channel="conda-forge",
+        pip_install=True,
+    )
 
 
 def register_dataframe_method(method):
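
With the upstream helper imported directly, registering an accessor on a pandas-on-Spark DataFrame is a one-decorator job; this is the machinery the deleted `CachedAccessor`/`_register_accessor` pair had copied from `pandas.core.accessor`. A minimal usage sketch, assuming pyspark >= 3.2.0; the accessor name `shout` and its class are hypothetical, not pyjanitor API:

    import pyspark.pandas as ps
    from pyspark.pandas.extensions import register_dataframe_accessor

    @register_dataframe_accessor("shout")  # hypothetical accessor name
    class ShoutAccessor:
        """Expose df.shout.* methods on pandas-on-Spark DataFrames."""

        def __init__(self, pandas_on_spark_df):
            self._df = pandas_on_spark_df

        def columns_upper(self):
            # Return a copy with upper-cased column names.
            return self._df.rename(columns=str.upper)

    df = ps.DataFrame({"a": [1, 2], "b": [3, 4]})
    print(df.shout.columns_upper().columns)  # Index(['A', 'B'], dtype='object')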
