Linting, unit tests, and Yates' correction check (#3)

dafrenchyman · web-flow · commit 23213e573f29 · 2020-09-13T20:54:45.000+05:30
* Adding unit tests

* Redoing requirements to be cleaner. Also added pre-commit and auto-linting.

* Ran flake8 and black on setup.py

* Linting and Yates' correction factor check on pycorrcat

* Added an extra unit test to check the correlation matrix output.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E501, W503
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# IDE
+.idea/
diff --git a/.isort.cfg b/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+known_third_party = matplotlib,numpy,pandas,pip,scipy,seaborn,setuptools
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,22 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 20.8b1
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://github.com/asottile/seed-isort-config
+    rev: v2.2.0
+    hooks:
+      - id: seed-isort-config
+  - repo: https://github.com/timothycrosley/isort
+    rev: 5.5.1
+    hooks:
+      - id: isort
+  - repo: https://github.com/Lucas-C/pre-commit-hooks-safety
+    rev: v1.1.3
+    hooks:
+      - id: python-safety-dependencies-check
+  - repo: https://gitlab.com/pycqa/flake8
+    rev: 3.8.3
+    hooks:
+      - id: flake8
diff --git a/README.md b/README.md
@@ -16,7 +16,14 @@ plot_corr(df, ['dogs','cats'] )
 ```
 
 ## Development setup
-Create a virtualenv and install dependencies from ```requirements.txt``` and continue with code change.
+Create a virtualenv and install dependencies:
+- `pip install -r requirements.dev.txt`
+- `pip install -r requirements.txt`
+Then install the pre-commit hooks: `pre-commit install` and continue with code change.
+
+### Run `pre-commit` locally to check files
+
+`pre-commit run --all-files`
 
 ## Release History
 
diff --git a/pycorrcat/pycorrcat.py b/pycorrcat/pycorrcat.py
@@ -1,29 +1,29 @@
 import warnings
-import pandas as pd
-import scipy.stats as stats
-import numpy as np
 from typing import List
+
 import matplotlib as matplotlib
+import numpy as np
+import pandas as pd
+import scipy.stats as stats
 import seaborn as sns
 
+
 def fillna(object):
     if isinstance(object, pd.Series):
         return object.fillna(0)
     else:
         return np.array([value if value is not None else 0 for value in object])
 
-def corr(x, 
-         y,
-         bias_correction=True,
-         Tschuprow=False):
+
+def corr(x, y, bias_correction=True, Tschuprow=False):
     """
     Calculates correlation statistic for categorical-categorical association.
     The two measures supported are:
     1. Cramer'V ( default )
     2. Tschuprow'T
 
     Bias correction and formula's taken from : https://www.researchgate.net/publication/270277061_A_bias-correction_for_Cramer's_V_and_Tschuprow's_T
-    
+
     Wikipedia for Cramer's V: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
     Wikipedia for Tschuprow' T: https://en.wikipedia.org/wiki/Tschuprow%27s_T
     Parameters:
@@ -44,34 +44,48 @@ def corr(x,
         x, y = fillna(x), fillna(y)
         crosstab_matrix = pd.crosstab(x, y)
         n_observations = crosstab_matrix.sum().sum()
-        chi2, p, dof, expected = stats.chi2_contingency(crosstab_matrix)
+
+        yates_correct = True
+        if bias_correction:
+            if crosstab_matrix.shape == (2, 2):
+                yates_correct = False
+
+        chi2, _, _, _ = stats.chi2_contingency(
+            crosstab_matrix, correction=yates_correct
+        )
         phi2 = chi2 / n_observations
 
         # r and c are number of categories of x and y
         r, c = crosstab_matrix.shape
         if bias_correction:
             phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1))
-            r_corrected  = r - ((r - 1)**2) / (n_observations - 1)
-            c_corrected = c - ((c - 1)**2) / (n_observations - 1)
+            r_corrected = r - ((r - 1) ** 2) / (n_observations - 1)
+            c_corrected = c - ((c - 1) ** 2) / (n_observations - 1)
             if Tschuprow:
-                corr_coeff = np.sqrt(phi2_corrected / np.sqrt((r_corrected - 1)*(c_corrected - 1)))
+                corr_coeff = np.sqrt(
+                    phi2_corrected / np.sqrt((r_corrected - 1) * (c_corrected - 1))
+                )
                 return corr_coeff
-            corr_coeff = np.sqrt(phi2_corrected / min((r_corrected - 1), (c_corrected - 1)))
+            corr_coeff = np.sqrt(
+                phi2_corrected / min((r_corrected - 1), (c_corrected - 1))
+            )
             return corr_coeff
         if Tschuprow:
-            corr_coeff = np.sqrt(phi2 / np.sqrt((r - 1)*(c - 1)))
-            return corr_coeff    
+            corr_coeff = np.sqrt(phi2 / np.sqrt((r - 1) * (c - 1)))
+            return corr_coeff
         corr_coeff = np.sqrt(phi2 / min((r - 1), (c - 1)))
         return corr_coeff
-    except:
-        warnings.warn("Error calculating Cramer's V",RuntimeWarning)
+    except Exception:
+        warnings.warn("Error calculating Cramer's V", RuntimeWarning)
         return corr_coeff
 
 
-def corr_matrix(data: pd.DataFrame, 
-                columns: List,
-                bias_correction: bool=True,
-                Tschuprow: bool=False) -> pd.DataFrame:
+def corr_matrix(
+    data: pd.DataFrame,
+    columns: List,
+    bias_correction: bool = True,
+    Tschuprow: bool = False,
+) -> pd.DataFrame:
     """
     Calculates correlation for all the columns provided and returns pandas like correlation matrix.
     The two measures supported are:
@@ -82,7 +96,7 @@ def corr_matrix(data: pd.DataFrame,
     -----------
     data : pandas DataFrame
         A pandas DataFrame containing the categorical columns
-    columns : list 
+    columns : list
         A list of categorical columns
     bias_correction : Boolean, default = True
     Tschuprow : Boolean, default = False
@@ -92,31 +106,42 @@ def corr_matrix(data: pd.DataFrame,
     pandas dataframe object similar to pandas.DataFrame.corr()
     """
     # checking length of columns
-    if not columns.__len__()>0 or set(data.columns.values).intersection(columns).__len__()>0 :
+    if (
+        not columns.__len__() > 0
+        or set(data.columns.values).intersection(columns).__len__() > 0
+    ):
         ValueError("Check the columns list provided")
 
     target_data = data.filter(columns)
     cols = target_data.columns.values
     shape = target_data.columns.__len__()
 
     matrix = np.zeros((shape, shape))
-    for x,i in enumerate(cols):
+    for x, i in enumerate(cols):
         temp = np.zeros((0, shape))
         for j in cols:
-            temp = np.append(temp,corr(target_data[i], target_data[j], bias_correction=bias_correction, Tschuprow=Tschuprow))
+            temp = np.append(
+                temp,
+                corr(
+                    target_data[i],
+                    target_data[j],
+                    bias_correction=bias_correction,
+                    Tschuprow=Tschuprow,
+                ),
+            )
         matrix[x] = temp
 
-    corr_matrix = pd.DataFrame(data=matrix,
-                            index=cols,
-                            columns=cols)
+    corr_matrix = pd.DataFrame(data=matrix, index=cols, columns=cols)
     return corr_matrix
 
-def plot_corr(data: pd.DataFrame,
-              columns: List,
-              diagonal: str = False,
-              bias_correction: bool=True,
-              Tschuprow: bool=False
-            ) -> matplotlib.axes.Axes:
+
+def plot_corr(
+    data: pd.DataFrame,
+    columns: List,
+    diagonal: str = False,
+    bias_correction: bool = True,
+    Tschuprow: bool = False,
+) -> matplotlib.axes.Axes:
     """
     Plots correlation matrix for all the columns provided and returns Matplotlib axes.
     The two measures supported are:
@@ -127,7 +152,7 @@ def plot_corr(data: pd.DataFrame,
     -----------
     data : pandas DataFrame
         A pandas DataFrame containing the categorical columns
-    columns : list 
+    columns : list
         A list of categorical columns
     diagonal :  string
         When true gives a masked version of heatmap
@@ -139,8 +164,10 @@ def plot_corr(data: pd.DataFrame,
     ax : matplotlib Axes
     Axes object with the heatmap.
     """
-    corr = corr_matrix(data, columns, bias_correction=bias_correction, Tschuprow=Tschuprow)
-    if(diagonal):
+    corr = corr_matrix(
+        data, columns, bias_correction=bias_correction, Tschuprow=Tschuprow
+    )
+    if diagonal:
         mask = np.triu(corr)
         return sns.heatmap(corr, annot=True, mask=mask)
-    return sns.heatmap(corr, annot=True)
+    return sns.heatmap(corr, annot=True)
diff --git a/requirements.dev.in b/requirements.dev.in
@@ -0,0 +1,5 @@
+black
+docutils
+flake8
+pip-tools
+pre-commit
diff --git a/requirements.dev.txt b/requirements.dev.txt
@@ -0,0 +1,37 @@
+#
+# This file is autogenerated by pip-compile
+# To update, run:
+#
+#    pip-compile --output-file=requirements.dev.txt requirements.dev.in
+#
+appdirs==1.4.4            # via black, virtualenv
+black==20.8b1             # via -r requirements.dev.in
+cfgv==3.2.0               # via pre-commit
+click==7.1.2              # via black, pip-tools
+dataclasses==0.7          # via black
+distlib==0.3.1            # via virtualenv
+docutils==0.16            # via -r requirements.dev.in
+filelock==3.0.12          # via virtualenv
+flake8==3.8.3             # via -r requirements.dev.in
+identify==1.5.0           # via pre-commit
+importlib-metadata==1.7.0  # via flake8, pre-commit, virtualenv
+importlib-resources==3.0.0  # via pre-commit, virtualenv
+mccabe==0.6.1             # via flake8
+mypy-extensions==0.4.3    # via black
+nodeenv==1.5.0            # via pre-commit
+pathspec==0.8.0           # via black
+pip-tools==5.3.1          # via -r requirements.dev.in
+pre-commit==2.7.1         # via -r requirements.dev.in
+pycodestyle==2.6.0        # via flake8
+pyflakes==2.2.0           # via flake8
+pyyaml==5.3.1             # via pre-commit
+regex==2020.7.14          # via black
+six==1.15.0               # via pip-tools, virtualenv
+toml==0.10.1              # via black, pre-commit
+typed-ast==1.4.1          # via black
+typing-extensions==3.7.4.3  # via black
+virtualenv==20.0.31       # via pre-commit
+zipp==3.1.0               # via importlib-metadata, importlib-resources
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
diff --git a/requirements.in b/requirements.in
@@ -0,0 +1,4 @@
+matplotlib
+numpy
+pandas
+seaborn
diff --git a/requirements.txt b/requirements.txt
@@ -1,28 +1,17 @@
-bleach==3.1.5
-certifi==2020.4.5.1
-chardet==3.0.4
-cycler==0.10.0
-docutils==0.16
-idna==2.9
-keyring==21.2.1
-kiwisolver==1.2.0
-matplotlib==3.2.1
-numpy==1.18.4
-packaging==20.3
-pandas==1.0.3
-pkginfo==1.5.0.1
-Pygments==2.6.1
-pyparsing==2.4.7
-python-dateutil==2.8.1
-pytz==2020.1
-pywin32-ctypes==0.2.0
-readme-renderer==26.0
-requests==2.23.0
-requests-toolbelt==0.9.1
-scipy==1.4.1
-seaborn==0.10.1
-six==1.14.0
-tqdm==4.46.0
-twine==3.1.1
-urllib3==1.25.9
-webencodings==0.5.1
+#
+# This file is autogenerated by pip-compile
+# To update, run:
+#
+#    pip-compile --output-file=requirements.txt requirements.in
+#
+cycler==0.10.0            # via matplotlib
+kiwisolver==1.2.0         # via matplotlib
+matplotlib==3.2.1         # via -r requirements.in, seaborn
+numpy==1.18.4             # via -r requirements.in, matplotlib, pandas, scipy, seaborn
+pandas==1.0.3             # via -r requirements.in, seaborn
+pyparsing==2.4.7          # via matplotlib
+python-dateutil==2.8.1    # via matplotlib, pandas
+pytz==2020.1              # via pandas
+scipy==1.4.1              # via seaborn
+seaborn==0.10.1           # via -r requirements.in
+six==1.14.0               # via cycler, python-dateutil
diff --git a/setup.py b/setup.py
@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
 
-from setuptools import setup, find_packages
-from pip._internal.req import parse_requirements
 from os import path
 
+from pip._internal.req import parse_requirements
+from setuptools import find_packages, setup
+
 this_directory = path.abspath(path.dirname(__file__))
-with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
+with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 
 setup(
-    name='pycorr',
-    version='0.1.4',
-    description='Python package for calculating correlation amongst categorical variables',
-    long_description_content_type='text/markdown',
+    name="pycorr",
+    version="0.1.4",
+    description="Python package for calculating correlation amongst categorical variables",
+    long_description_content_type="text/markdown",
     long_description=long_description,
-    author='Anurag Kumar Mishra',
-    author_email='anuragkm25@outlook.com',
-    url='https://github.com/MavericksDS/pycorr',
-    packages=find_packages(exclude=('tests', 'docs')),
-    install_reqs = parse_requirements('requirements.txt', session='hack'),
+    author="Anurag Kumar Mishra",
+    author_email="anuragkm25@outlook.com",
+    url="https://github.com/MavericksDS/pycorr",
+    packages=find_packages(exclude=("tests", "docs")),
+    install_reqs=parse_requirements("requirements.txt", session="hack"),
     classifiers=[
         "Programming Language :: Python :: 3.7",
         "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
-    ]
-)
+    ],
+)
diff --git a/tests/pycorrcat_test.py b/tests/pycorrcat_test.py

-Original file line number
+Diff line change
 # Pyre type checker
 .pyre/
++
 +# IDE
 +.idea/
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+[settings]`
	`2`	`+known_third_party = matplotlib,numpy,pandas,pip,scipy,seaborn,setuptools`