Merge pull request #170 from datarian/test-helper-refactoring

janmotl · web-flow · commit ae84e05c68a3 · 2019-03-21T14:11:17.000+01:00
Refactoring of test_helper
diff --git a/category_encoders/tests/helpers.py b/category_encoders/tests/helpers.py
@@ -0,0 +1,70 @@
+"""Helper functions that are used exclusively in the tests"""
+
+import numpy as np
+import random
+import pandas as pd
+import math
+
+
+def verify_numeric(X_test):
+    """
+    Assert that every column dtype in the DataFrame X_test is numeric (bool, integer, float or complex).
+    """
+    _NUMERIC_KINDS = set('buifc')  # numpy dtype.kind codes: b=bool, u=unsigned int, i=signed int, f=float, c=complex
+    
+    for dt in X_test.dtypes:
+        assert(dt.kind in _NUMERIC_KINDS)  # AssertionError on the first non-numeric column
+
+def create_array(n_rows=1000, extras=False, has_none=True):
+    """
+    Creates a numpy array of n_rows rows, each holding two random floats followed by four categorical values.
+    """
+    ds = [[  # the random module is not seeded inside this function
+        random.random(),  # Floats
+        random.random(),  # Floats
+        random.choice(['A', 'B', 'C']),  # Categorical strings
+        random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),  # 'D' can only appear when extras=True
+        random.choice(['A', 'B', 'C', None, np.nan]) if has_none else random.choice(['A', 'B', 'C']),  # None/NaN can only appear when has_none=True
+        random.choice(['A'])  # Always 'A' (invariant column)
+    ] for _ in range(n_rows)]
+
+    return np.array(ds)
+
+
+def create_dataset(n_rows=1000, extras=False, has_none=True):
+    """
+    Creates a pandas DataFrame of n_rows rows and 11 columns of numeric, string, missing-value and categorical test data.
+    """
+    random.seed(2001)  # fixed seed: the same arguments always produce the same rows
+    ds = [[
+        random.random(),                                                                        # Floats
+        random.choice([float('nan'), float('inf'), float('-inf'), -0, 0, 1, -1, math.pi]),      # Floats with edge scenarios (note: -0 is the int 0, not -0.0)
+        row,                                                                                    # Unique integers
+        str(row),                                                                               # Unique strings
+        random.choice(['A', 'B']) if extras else 'A',                                           # Invariant ('A') unless extras=True
+        random.choice(['A', 'B_b', 'C_c_c']),                                                   # Strings with underscores to test reverse_dummies()
+        random.choice(['A', 'B', 'C', None]) if has_none else random.choice(['A', 'B', 'C']),   # None, only when has_none=True
+        random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),      # With a new string value ('D'), only when extras=True
+        random.choice([12, 43, -32]),                                                           # Number in the column name
+        random.choice(['A', 'B', 'C']),                                                         # What is going to become the categorical column
+        random.choice(['A', 'B', 'C', np.nan])                                                  # Categorical with missing values
+    ] for row in range(n_rows)]
+
+    df = pd.DataFrame(ds, columns=['float', 'float_edge', 'unique_int', 'unique_str', 'invariant', 'underscore', 'none', 'extra', 321, 'categorical', 'categorical_na'])
+    df['categorical'] = pd.Categorical(df['categorical'], categories=['A', 'B', 'C'])  # convert to pandas Categorical with a fixed category list
+    df['categorical_na'] = pd.Categorical(df['categorical_na'], categories=['A', 'B', 'C'])  # NaN is not a listed category, so those entries stay missing
+    return df
+
+
+def verify_inverse_transform(x, x_inv):
+    """
+    Assert that x.equals(x_inv). For pandas objects, equals() treats NaNs in the same location as equal.
+    """
+    assert x.equals(x_inv)
+
+
+def deep_round(A, ndigits=5):
+    """
+    Returns a new list of lists with every value of A rounded to ndigits digits. Useful for approximate equality testing.
+    """
+    return [[round(val, ndigits) for val in sublst] for sublst in A]
diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pandas as pd
 import sklearn
-import category_encoders.tests.test_helpers as th
+import category_encoders.tests.helpers as th
 from sklearn.utils.estimator_checks import check_transformer_general, check_transformers_unfitted
 from unittest2 import TestSuite, TextTestRunner, TestCase  # or `from unittest import ...` if on Python 3.4+
 
diff --git a/category_encoders/tests/test_helpers.py b/category_encoders/tests/test_helpers.py
@@ -1,70 +1,42 @@
-"""Helper functions that are used exclusively in the tests"""
-
 import numpy as np
-import random
 import pandas as pd
-import math
-
+from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+
 
-def verify_numeric(X_test):
-    """
-    Test that all attributes in the DataFrame are numeric.
-    """
-    for dt in X_test.dtypes:
-        numeric = False
-        if np.issubdtype(dt, np.dtype(int)) or np.issubdtype(dt, np.dtype(float)):
-            numeric = True
-        assert numeric
+from category_encoders.tests.helpers import verify_numeric
 
 
-def create_array(n_rows=1000, extras=False, has_none=True):
-    """
-    Creates a numpy dataset with some categorical variables.
-    """
-    ds = [[
-        random.random(),
-        random.random(),
-        random.choice(['A', 'B', 'C']),
-        random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),
-        random.choice(['A', 'B', 'C', None, np.nan]) if has_none else random.choice(['A', 'B', 'C']),
-        random.choice(['A'])
-    ] for _ in range(n_rows)]
+class TestHelpers(TestCase):
 
-    return np.array(ds)
+    def test_is_numeric_pandas(self):
+        # Whole numbers, regardless of the byte length, should not raise AssertionError
+        X = pd.DataFrame(np.ones([5, 5]), dtype='int32')
+        verify_numeric(pd.DataFrame(X))
 
+        X = pd.DataFrame(np.ones([5, 5]), dtype='int64')
+        verify_numeric(pd.DataFrame(X))
 
-def create_dataset(n_rows=1000, extras=False, has_none=True):
-    """
-    Creates a dataset with some categorical variables.
-    """
-    random.seed(2001)
-    ds = [[
-        random.random(),                                                                        # Floats
-        random.choice([float('nan'), float('inf'), float('-inf'), -0, 0, 1, -1, math.pi]),      # Floats with edge scenarios
-        row,                                                                                    # Unique integers
-        str(row),                                                                               # Unique strings
-        random.choice(['A', 'B']) if extras else 'A',                                           # Invariant in the training data
-        random.choice(['A', 'B_b', 'C_c_c']),                                                   # Strings with underscores to test reverse_dummies()
-        random.choice(['A', 'B', 'C', None]) if has_none else random.choice(['A', 'B', 'C']),   # None
-        random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']),      # With a new string value
-        random.choice([12, 43, -32]),                                                           # Number in the column name
-        random.choice(['A', 'B', 'C']),                                                         # What is going to become the categorical column
-    ] for row in range(n_rows)]
+        # Strings should raise AssertionError
+        X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']])
+        with self.assertRaises(Exception):
+            verify_numeric(pd.DataFrame(X))
 
-    df = pd.DataFrame(ds, columns=['float', 'float_edge', 'unique_int', 'unique_str', 'invariant', 'underscore', 'none', 'extra', 321, 'categorical'])
-    df['categorical'] = pd.Categorical(df['categorical'], categories=['A', 'B', 'C'])
-    return df
+    def test_is_numeric_numpy(self):
+        # Whole numbers, regardless of the byte length, should not raise AssertionError
+        X = np.ones([5, 5], dtype='int32')
+        verify_numeric(pd.DataFrame(X))
 
+        X = np.ones([5, 5], dtype='int64')
+        verify_numeric(pd.DataFrame(X))
 
-def verify_inverse_transform(x, x_inv):
-    """
-    Verify x is equal to x_inv. The test returns true for NaN.equals(NaN) as it should.
-    """
-    assert x.equals(x_inv)
+        # Floats
+        X = np.ones([5, 5], dtype='float32')
+        verify_numeric(pd.DataFrame(X))
 
+        X = np.ones([5, 5], dtype='float64')
+        verify_numeric(pd.DataFrame(X))
 
-def deep_round(A, ndigits=5):
-    """
-    Rounds numbers in a list of lists. Useful for approximate equality testing.
-    """
-    return [[round(val, ndigits) for val in sublst] for sublst in A]
+    def test_verify_raises_AssertionError_on_categories(self):
+        # Categories should raise AssertionError
+        X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']], dtype='category')
+        with self.assertRaises(Exception):
+            verify_numeric(pd.DataFrame(X))
diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+
-import category_encoders.tests.test_helpers as th
+import category_encoders.tests.helpers as th
 import numpy as np
 
 import category_encoders as encoders
diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py
@@ -1,8 +1,8 @@
 import pandas as pd
-from unittest import TestCase  # or `from unittest import ...` if on Python 3.4+
+from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+
 import numpy as np
 import warnings
-import category_encoders.tests.test_helpers as th
+import category_encoders.tests.helpers as th
 
 import category_encoders as encoders
 
diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+
-import category_encoders.tests.test_helpers as th
+import category_encoders.tests.helpers as th
 import numpy as np
 import warnings
 import category_encoders as encoders
@@ -35,7 +35,7 @@ def test_ordinal(self):
         self.assertIn(-1, set(out['extra'].values))
         self.assertTrue(len(enc.mapping) > 0)
 
-        enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='return _nan')
+        enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='return_nan')
         enc.fit(X)
         out = enc.transform(X_t)
         out_cats = [x for x in set(out['extra'].values) if np.isfinite(x)]
diff --git a/category_encoders/tests/test_target_encoder.py b/category_encoders/tests/test_target_encoder.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+
-import category_encoders.tests.test_helpers as th
+import category_encoders.tests.helpers as th
 import numpy as np
 
 import category_encoders as encoders
diff --git a/category_encoders/tests/test_woe.py b/category_encoders/tests/test_woe.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from unittest2 import TestCase  # or `from unittest import ...` if on Python 3.4+
-import category_encoders.tests.test_helpers as th
+import category_encoders.tests.helpers as th
 import numpy as np
 
 import category_encoders as encoders