Commit 45c38b3

Fix test for datasets submodule (#151)
* 🐛 Fix bug: incorrect value
* ♻️ Add test for fetch_megafon
* 📝 Fix example in fetch_megafon docstring
* 📝 Fix megafon test
* 📝 Fix CI pipeline
* 📝 Fix CI pipeline: install requirements.txt during the Sphinx build
* 📝 Clear test dir
1 parent 32880af · commit 45c38b3

File tree

3 files changed (+84, -36 lines):

.github/workflows/ci-test.yml
sklift/datasets/datasets.py
sklift/tests/test_datasets.py

.github/workflows/ci-test.yml

Lines changed: 5 additions & 3 deletions
@@ -2,6 +2,7 @@ name: Python package
 
 on:
   push:
+    branches: [ master ]
   pull_request_target:
 
 jobs:
@@ -10,12 +11,13 @@ jobs:
     runs-on: ${{ matrix.os }}
     env:
       USING_COVERAGE_PY: '3.8'
-      USING_COVERAGE_OS: 'ubuntu-latest'
+      USING_COVERAGE_OS: 'macos-latest'
 
     strategy:
       matrix:
         os: ['ubuntu-latest', 'windows-latest', 'macos-latest']
         python-version: ['3.6', '3.7', '3.8', '3.9']
+        platform: 'x64'
       fail-fast: false
 
     steps:
@@ -51,6 +53,6 @@ jobs:
       - name: Update pip
         run: python -m pip install --upgrade pip
       - name: Install dependencies
-        run: pip install -r docs/requirements.txt
+        run: pip install -r docs/requirements.txt -r requirements.txt
      - name: Run Sphinx
-        run: sphinx-build -b html docs /tmp/_docs_build
+        run: sphinx-build -W -b html docs /tmp/_docs_build
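
For context, the updated docs job now installs both the docs and package requirement files and runs Sphinx with -W, which promotes warnings to build failures. A rough local reproduction of that step, assuming it is run from the repository root with pip and Sphinx available, could look like this sketch:

import subprocess

# Install the docs requirements plus the package requirements, mirroring the updated CI step.
subprocess.run(
    ["python", "-m", "pip", "install", "-r", "docs/requirements.txt", "-r", "requirements.txt"],
    check=True,
)

# Build the HTML docs; -W turns Sphinx warnings into errors so problems fail the build.
subprocess.run(
    ["sphinx-build", "-W", "-b", "html", "docs", "/tmp/_docs_build"],
    check=True,
)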

sklift/datasets/datasets.py

Lines changed: 7 additions & 7 deletions
@@ -381,15 +381,15 @@ def fetch_criteo(target_col='visit', treatment_col='treatment', data_home=None,
     if treatment_col == 'all':
         treatment_col = treatment_cols
     elif treatment_col not in treatment_cols:
-        raise ValueError(f"treatment_col value must be in {treatment_cols + ['all']}. "
-                         f"Got value {treatment_col}.")
+        raise ValueError(f"The treatment_col must be an element of {treatment_cols + ['all']}. "
+                         f"Got value target_col={treatment_col}.")
 
     target_cols = ['visit', 'conversion']
     if target_col == 'all':
         target_col = target_cols
     elif target_col not in target_cols:
-        raise ValueError(f"target_col value must be from {target_cols + ['all']}. "
-                         f"Got value {target_col}.")
+        raise ValueError(f"The target_col must be an element of {target_cols + ['all']}. "
+                         f"Got value target_col={target_col}.")
 
     if percent10:
         url = 'https://criteo-bucket.s3.eu-central-1.amazonaws.com/criteo10.csv.gz'
@@ -494,8 +494,8 @@ def fetch_hillstrom(target_col='visit', data_home=None, dest_subdir=None, downlo
     if target_col == 'all':
         target_col = target_cols
     elif target_col not in target_cols:
-        raise ValueError(f"target_col value must be from {target_cols + ['all']}. "
-                         f"Got value {target_col + ['all']}.")
+        raise ValueError(f"The target_col must be an element of {target_cols + ['all']}. "
+                         f"Got value target_col={target_col}.")
 
     url = 'https://hillstorm1.s3.us-east-2.amazonaws.com/hillstorm_no_indices.csv.gz'
     filename = url.split('/')[-1]
@@ -566,7 +566,7 @@ def fetch_megafon(data_home=None, dest_subdir=None, download_if_missing=True,
 
 
         dataset = fetch_megafon()
-        data, treatment, target = dataset.data, dataset.treatment, dataset.target
+        data, target, treatment = dataset.data, dataset.target, dataset.treatment
 
         # alternative option
         data, target, treatment = fetch_megafon(return_X_y_t=True)
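
The docstring fix above makes the example unpack the Bunch in the same data, target, treatment order as the tuple returned by return_X_y_t=True. A minimal usage sketch of both access patterns, assuming the MegaFon dataset downloads successfully:

from sklift.datasets import fetch_megafon

# Bunch access, in the corrected data, target, treatment order.
dataset = fetch_megafon()
data, target, treatment = dataset.data, dataset.target, dataset.treatment

# Alternative: request the same three objects directly as a tuple.
data, target, treatment = fetch_megafon(return_X_y_t=True)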

sklift/tests/test_datasets.py

Lines changed: 72 additions & 26 deletions
@@ -3,19 +3,20 @@
 
 from functools import partial
 
-from ..datasets import (fetch_lenta, fetch_x5,
-                        fetch_criteo, fetch_hillstrom)
+from ..datasets import (
+    clear_data_dir,
+    fetch_lenta, fetch_x5,
+    fetch_criteo, fetch_hillstrom,
+    fetch_megafon
+)
 
 
 fetch_criteo10 = partial(fetch_criteo, percent10=True)
 
-
-def check_return_X_y_t(bunch, dataset_func):
-    X_y_t_tuple = dataset_func(return_X_y_t=True)
-    assert isinstance(X_y_t_tuple, tuple)
-    assert X_y_t_tuple[0].shape == bunch.data.shape
-    assert X_y_t_tuple[1].shape == bunch.target.shape
-    assert X_y_t_tuple[2].shape == bunch.treatment.shape
+@pytest.fixture(scope="session", autouse=True)
+def clear():
+    # prepare something ahead of all tests
+    clear_data_dir()
 
 
 @pytest.fixture
@@ -53,20 +54,11 @@ def test_fetch_x5(x5_dataset):
     assert data.treatment.shape == x5_dataset['treatment.shape']
 
 
-@pytest.mark.parametrize(
-    'target_col, target_shape',
-    [('visit', (64_000,)),
-     ('conversion', (64_000,)),
-     ('spend', (64_000,)),
-     ('all', (64_000, 3))]
-)
-def test_fetch_hillstrom(
-        target_col, target_shape
-):
-    data = fetch_hillstrom(target_col=target_col)
-    assert data.data.shape == (64_000, 8)
-    assert data.target.shape == target_shape
-    assert data.treatment.shape == (64_000,)
+@pytest.fixture
+def criteo10_dataset() -> dict:
+    data = {'keys': ['data', 'target', 'treatment', 'DESCR', 'feature_names', 'target_name', 'treatment_name'],
+            'data.shape': (1397960, 12)}
+    return data
 
 
 @pytest.mark.parametrize(
@@ -82,15 +74,69 @@ def test_fetch_hillstrom(
      ('all', (1397960, 2))]
 )
 def test_fetch_criteo10(
-        target_col, target_shape, treatment_col, treatment_shape
+        criteo10_dataset,
+        target_col, target_shape,
+        treatment_col, treatment_shape
 ):
     data = fetch_criteo10(target_col=target_col, treatment_col=treatment_col)
-    assert data.data.shape == (1397960, 12)
+    assert isinstance(data, sklearn.utils.Bunch)
+    assert set(data.keys()) == set(criteo10_dataset['keys'])
+    assert data.data.shape == criteo10_dataset['data.shape']
     assert data.target.shape == target_shape
     assert data.treatment.shape == treatment_shape
 
 
-@pytest.mark.parametrize("fetch_func", [fetch_hillstrom, fetch_criteo10, fetch_lenta])
+@pytest.fixture
+def hillstrom_dataset() -> dict:
+    data = {'keys': ['data', 'target', 'treatment', 'DESCR', 'feature_names', 'target_name', 'treatment_name'],
+            'data.shape': (64000, 8), 'treatment.shape': (64000,)}
+    return data
+
+
+@pytest.mark.parametrize(
+    'target_col, target_shape',
+    [('visit', (64_000,)),
+     ('conversion', (64_000,)),
+     ('spend', (64_000,)),
+     ('all', (64_000, 3))]
+)
+def test_fetch_hillstrom(
+        hillstrom_dataset,
+        target_col, target_shape
+):
+    data = fetch_hillstrom(target_col=target_col)
+    assert isinstance(data, sklearn.utils.Bunch)
+    assert set(data.keys()) == set(hillstrom_dataset['keys'])
+    assert data.data.shape == hillstrom_dataset['data.shape']
+    assert data.target.shape == target_shape
+    assert data.treatment.shape == hillstrom_dataset['treatment.shape']
+
+
+@pytest.fixture
+def megafon_dataset() -> dict:
+    data = {'keys': ['data', 'target', 'treatment', 'DESCR', 'feature_names', 'target_name', 'treatment_name'],
+            'data.shape': (600000, 50), 'target.shape': (600000,), 'treatment.shape': (600000,)}
+    return data
+
+
+def test_fetch_megafon(megafon_dataset):
+    data = fetch_megafon()
+    assert isinstance(data, sklearn.utils.Bunch)
+    assert set(data.keys()) == set(megafon_dataset['keys'])
+    assert data.data.shape == megafon_dataset['data.shape']
+    assert data.target.shape == megafon_dataset['target.shape']
+    assert data.treatment.shape == megafon_dataset['treatment.shape']
+
+
+def check_return_X_y_t(bunch, dataset_func):
+    X_y_t_tuple = dataset_func(return_X_y_t=True)
+    assert isinstance(X_y_t_tuple, tuple)
+    assert X_y_t_tuple[0].shape == bunch.data.shape
+    assert X_y_t_tuple[1].shape == bunch.target.shape
+    assert X_y_t_tuple[2].shape == bunch.treatment.shape
+
+
+@pytest.mark.parametrize("fetch_func", [fetch_hillstrom, fetch_criteo10, fetch_lenta, fetch_megafon])
 def test_return_X_y_t(fetch_func):
     data = fetch_func()
     check_return_X_y_t(data, fetch_func)
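
The new clear fixture relies on pytest's session-scoped, autouse fixtures so the data directory is wiped once before any test in the session runs. A standalone sketch of the same pattern, using a hypothetical stand-in for clear_data_dir and assuming only that pytest is installed:

import pytest


def clear_data_dir():
    # Hypothetical stand-in for sklift.datasets.clear_data_dir, kept local so the sketch is self-contained.
    print("data directory cleared")


@pytest.fixture(scope="session", autouse=True)
def clear():
    # autouse=True applies the fixture to every test without listing it as an argument;
    # scope="session" means this body runs once, before the first test of the session.
    clear_data_dir()


def test_runs_after_cleanup():
    # By the time any test body executes, clear() has already run.
    assert True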
