Skip to content

Commit 23213e5

Browse files
authored
Linting, unit tests, and Yates check (#3)
* Adding unit tests * Redoing requirements to be cleaner. Also added pre-commit and auto-linting. * flake8 and black done on setup * Linting and Yates' correction factor check on pycorrcat * Added an extra unit test to check the correlation matrix output.
1 parent e1e8ea9 commit 23213e5

File tree

12 files changed

+231
-81
lines changed

12 files changed

+231
-81
lines changed

.flake8

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[flake8]
2+
ignore = E501, W503

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,6 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
131+
# IDE
132+
.idea/

.isort.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[settings]
2+
known_third_party = matplotlib,numpy,pandas,pip,scipy,seaborn,setuptools

.pre-commit-config.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
repos:
2+
- repo: https://github.com/psf/black
3+
rev: 20.8b1
4+
hooks:
5+
- id: black
6+
language_version: python3
7+
- repo: https://github.com/asottile/seed-isort-config
8+
rev: v2.2.0
9+
hooks:
10+
- id: seed-isort-config
11+
- repo: https://github.com/timothycrosley/isort
12+
rev: 5.5.1
13+
hooks:
14+
- id: isort
15+
- repo: https://github.com/Lucas-C/pre-commit-hooks-safety
16+
rev: v1.1.3
17+
hooks:
18+
- id: python-safety-dependencies-check
19+
- repo: https://gitlab.com/pycqa/flake8
20+
rev: 3.8.3
21+
hooks:
22+
- id: flake8

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,14 @@ plot_corr(df, ['dogs','cats'] )
1616
```
1717

1818
## Development setup
19-
Create a virtualenv and install dependencies from ```requirements.txt``` and continue with code change.
19+
Create a virtualenv and install dependencies:
20+
- `pip install -r requirements.dev.txt`
21+
- `pip install -r requirements.txt`
22+
Then install the pre-commit hooks: `pre-commit install` and continue with code changes.
23+
24+
### Run `pre-commit` locally to check files
25+
26+
`pre-commit run --all-files`
2027

2128
## Release History
2229

pycorrcat/pycorrcat.py

Lines changed: 65 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,29 @@
11
import warnings
2-
import pandas as pd
3-
import scipy.stats as stats
4-
import numpy as np
52
from typing import List
3+
64
import matplotlib as matplotlib
5+
import numpy as np
6+
import pandas as pd
7+
import scipy.stats as stats
78
import seaborn as sns
89

10+
911
def fillna(object):
    """Replace missing values in *object* with 0.

    A pandas Series is filled via ``Series.fillna(0)``; any other
    iterable is converted to a numpy array with each ``None`` entry
    replaced by 0.
    """
    if not isinstance(object, pd.Series):
        # NOTE(review): only None is treated as missing here — float NaN
        # values in a plain iterable pass through unchanged.
        return np.array([0 if value is None else value for value in object])
    return object.fillna(0)
1416

15-
def corr(x,
16-
y,
17-
bias_correction=True,
18-
Tschuprow=False):
17+
18+
def corr(x, y, bias_correction=True, Tschuprow=False):
1919
"""
2020
Calculates correlation statistic for categorical-categorical association.
2121
The two measures supported are:
2222
1. Cramer'V ( default )
2323
2. Tschuprow'T
2424
2525
Bias correction and formula's taken from : https://www.researchgate.net/publication/270277061_A_bias-correction_for_Cramer's_V_and_Tschuprow's_T
26-
26+
2727
Wikipedia for Cramer's V: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
2828
Wikipedia for Tschuprow' T: https://en.wikipedia.org/wiki/Tschuprow%27s_T
2929
Parameters:
@@ -44,34 +44,48 @@ def corr(x,
4444
x, y = fillna(x), fillna(y)
4545
crosstab_matrix = pd.crosstab(x, y)
4646
n_observations = crosstab_matrix.sum().sum()
47-
chi2, p, dof, expected = stats.chi2_contingency(crosstab_matrix)
47+
48+
yates_correct = True
49+
if bias_correction:
50+
if crosstab_matrix.shape == (2, 2):
51+
yates_correct = False
52+
53+
chi2, _, _, _ = stats.chi2_contingency(
54+
crosstab_matrix, correction=yates_correct
55+
)
4856
phi2 = chi2 / n_observations
4957

5058
# r and c are number of categories of x and y
5159
r, c = crosstab_matrix.shape
5260
if bias_correction:
5361
phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1))
54-
r_corrected = r - ((r - 1)**2) / (n_observations - 1)
55-
c_corrected = c - ((c - 1)**2) / (n_observations - 1)
62+
r_corrected = r - ((r - 1) ** 2) / (n_observations - 1)
63+
c_corrected = c - ((c - 1) ** 2) / (n_observations - 1)
5664
if Tschuprow:
57-
corr_coeff = np.sqrt(phi2_corrected / np.sqrt((r_corrected - 1)*(c_corrected - 1)))
65+
corr_coeff = np.sqrt(
66+
phi2_corrected / np.sqrt((r_corrected - 1) * (c_corrected - 1))
67+
)
5868
return corr_coeff
59-
corr_coeff = np.sqrt(phi2_corrected / min((r_corrected - 1), (c_corrected - 1)))
69+
corr_coeff = np.sqrt(
70+
phi2_corrected / min((r_corrected - 1), (c_corrected - 1))
71+
)
6072
return corr_coeff
6173
if Tschuprow:
62-
corr_coeff = np.sqrt(phi2 / np.sqrt((r - 1)*(c - 1)))
63-
return corr_coeff
74+
corr_coeff = np.sqrt(phi2 / np.sqrt((r - 1) * (c - 1)))
75+
return corr_coeff
6476
corr_coeff = np.sqrt(phi2 / min((r - 1), (c - 1)))
6577
return corr_coeff
66-
except:
67-
warnings.warn("Error calculating Cramer's V",RuntimeWarning)
78+
except Exception:
79+
warnings.warn("Error calculating Cramer's V", RuntimeWarning)
6880
return corr_coeff
6981

7082

71-
def corr_matrix(data: pd.DataFrame,
72-
columns: List,
73-
bias_correction: bool=True,
74-
Tschuprow: bool=False) -> pd.DataFrame:
83+
def corr_matrix(
84+
data: pd.DataFrame,
85+
columns: List,
86+
bias_correction: bool = True,
87+
Tschuprow: bool = False,
88+
) -> pd.DataFrame:
7589
"""
7690
Calculates correlation for all the columns provided and returns pandas like correlation matrix.
7791
The two measures supported are:
@@ -82,7 +96,7 @@ def corr_matrix(data: pd.DataFrame,
8296
-----------
8397
data : pandas DataFrame
8498
A pandas DataFrame containing the categorical columns
85-
columns : list
99+
columns : list
86100
A list of categorical columns
87101
bias_correction : Boolean, default = True
88102
Tschuprow : Boolean, default = False
@@ -92,31 +106,42 @@ def corr_matrix(data: pd.DataFrame,
92106
pandas dataframe object similar to pandas.DataFrame.corr()
93107
"""
94108
# checking length of columns
95-
if not columns.__len__()>0 or set(data.columns.values).intersection(columns).__len__()>0 :
109+
if (
110+
not columns.__len__() > 0
111+
or set(data.columns.values).intersection(columns).__len__() > 0
112+
):
96113
ValueError("Check the columns list provided")
97114

98115
target_data = data.filter(columns)
99116
cols = target_data.columns.values
100117
shape = target_data.columns.__len__()
101118

102119
matrix = np.zeros((shape, shape))
103-
for x,i in enumerate(cols):
120+
for x, i in enumerate(cols):
104121
temp = np.zeros((0, shape))
105122
for j in cols:
106-
temp = np.append(temp,corr(target_data[i], target_data[j], bias_correction=bias_correction, Tschuprow=Tschuprow))
123+
temp = np.append(
124+
temp,
125+
corr(
126+
target_data[i],
127+
target_data[j],
128+
bias_correction=bias_correction,
129+
Tschuprow=Tschuprow,
130+
),
131+
)
107132
matrix[x] = temp
108133

109-
corr_matrix = pd.DataFrame(data=matrix,
110-
index=cols,
111-
columns=cols)
134+
corr_matrix = pd.DataFrame(data=matrix, index=cols, columns=cols)
112135
return corr_matrix
113136

114-
def plot_corr(data: pd.DataFrame,
115-
columns: List,
116-
diagonal: str = False,
117-
bias_correction: bool=True,
118-
Tschuprow: bool=False
119-
) -> matplotlib.axes.Axes:
137+
138+
def plot_corr(
139+
data: pd.DataFrame,
140+
columns: List,
141+
diagonal: str = False,
142+
bias_correction: bool = True,
143+
Tschuprow: bool = False,
144+
) -> matplotlib.axes.Axes:
120145
"""
121146
Plots correlation matrix for all the columns provided and returns Matplotlib axes.
122147
The two measures supported are:
@@ -127,7 +152,7 @@ def plot_corr(data: pd.DataFrame,
127152
-----------
128153
data : pandas DataFrame
129154
A pandas DataFrame containing the categorical columns
130-
columns : list
155+
columns : list
131156
A list of categorical columns
132157
diagonal : string
133158
When true gives a masked version of heatmap
@@ -139,8 +164,10 @@ def plot_corr(data: pd.DataFrame,
139164
ax : matplotlib Axes
140165
Axes object with the heatmap.
141166
"""
142-
corr = corr_matrix(data, columns, bias_correction=bias_correction, Tschuprow=Tschuprow)
143-
if(diagonal):
167+
corr = corr_matrix(
168+
data, columns, bias_correction=bias_correction, Tschuprow=Tschuprow
169+
)
170+
if diagonal:
144171
mask = np.triu(corr)
145172
return sns.heatmap(corr, annot=True, mask=mask)
146-
return sns.heatmap(corr, annot=True)
173+
return sns.heatmap(corr, annot=True)

requirements.dev.in

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
black
2+
docutils
3+
flake8
4+
pip-tools
5+
pre-commit

requirements.dev.txt

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#
2+
# This file is autogenerated by pip-compile
3+
# To update, run:
4+
#
5+
# pip-compile --output-file=requirements.dev.txt requirements.dev.in
6+
#
7+
appdirs==1.4.4 # via black, virtualenv
8+
black==20.8b1 # via -r requirements.dev.in
9+
cfgv==3.2.0 # via pre-commit
10+
click==7.1.2 # via black, pip-tools
11+
dataclasses==0.7 # via black
12+
distlib==0.3.1 # via virtualenv
13+
docutils==0.16 # via -r requirements.dev.in
14+
filelock==3.0.12 # via virtualenv
15+
flake8==3.8.3 # via -r requirements.dev.in
16+
identify==1.5.0 # via pre-commit
17+
importlib-metadata==1.7.0 # via flake8, pre-commit, virtualenv
18+
importlib-resources==3.0.0 # via pre-commit, virtualenv
19+
mccabe==0.6.1 # via flake8
20+
mypy-extensions==0.4.3 # via black
21+
nodeenv==1.5.0 # via pre-commit
22+
pathspec==0.8.0 # via black
23+
pip-tools==5.3.1 # via -r requirements.dev.in
24+
pre-commit==2.7.1 # via -r requirements.dev.in
25+
pycodestyle==2.6.0 # via flake8
26+
pyflakes==2.2.0 # via flake8
27+
pyyaml==5.3.1 # via pre-commit
28+
regex==2020.7.14 # via black
29+
six==1.15.0 # via pip-tools, virtualenv
30+
toml==0.10.1 # via black, pre-commit
31+
typed-ast==1.4.1 # via black
32+
typing-extensions==3.7.4.3 # via black
33+
virtualenv==20.0.31 # via pre-commit
34+
zipp==3.1.0 # via importlib-metadata, importlib-resources
35+
36+
# The following packages are considered to be unsafe in a requirements file:
37+
# pip

requirements.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
matplotlib
2+
numpy
3+
pandas
4+
seaborn

requirements.txt

Lines changed: 17 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,17 @@
1-
bleach==3.1.5
2-
certifi==2020.4.5.1
3-
chardet==3.0.4
4-
cycler==0.10.0
5-
docutils==0.16
6-
idna==2.9
7-
keyring==21.2.1
8-
kiwisolver==1.2.0
9-
matplotlib==3.2.1
10-
numpy==1.18.4
11-
packaging==20.3
12-
pandas==1.0.3
13-
pkginfo==1.5.0.1
14-
Pygments==2.6.1
15-
pyparsing==2.4.7
16-
python-dateutil==2.8.1
17-
pytz==2020.1
18-
pywin32-ctypes==0.2.0
19-
readme-renderer==26.0
20-
requests==2.23.0
21-
requests-toolbelt==0.9.1
22-
scipy==1.4.1
23-
seaborn==0.10.1
24-
six==1.14.0
25-
tqdm==4.46.0
26-
twine==3.1.1
27-
urllib3==1.25.9
28-
webencodings==0.5.1
1+
#
2+
# This file is autogenerated by pip-compile
3+
# To update, run:
4+
#
5+
# pip-compile --output-file=requirements.txt requirements.in
6+
#
7+
cycler==0.10.0 # via matplotlib
8+
kiwisolver==1.2.0 # via matplotlib
9+
matplotlib==3.2.1 # via -r requirements.in, seaborn
10+
numpy==1.18.4 # via -r requirements.in, matplotlib, pandas, scipy, seaborn
11+
pandas==1.0.3 # via -r requirements.in, seaborn
12+
pyparsing==2.4.7 # via matplotlib
13+
python-dateutil==2.8.1 # via matplotlib, pandas
14+
pytz==2020.1 # via pandas
15+
scipy==1.4.1 # via seaborn
16+
seaborn==0.10.1 # via -r requirements.in
17+
six==1.14.0 # via cycler, python-dateutil

0 commit comments

Comments
 (0)