Merge branch 'dev' of github.com:maks-sh/scikit-uplift into dev

maks-sh · maks-sh · commit a14068949228 · 2022-06-17T13:09:24.000+03:00
diff --git a/docs/api/datasets/fetch_criteo.rst b/docs/api/datasets/fetch_criteo.rst
@@ -6,4 +6,17 @@
 
 .. autofunction:: sklift.datasets.datasets.fetch_criteo
 
+About the company
+##################
+
+Criteo is an advertising company that provides online display advertisements. 
+The company was founded and is headquartered in Paris, France. Criteo's product is a form of display advertising, 
+which displays interactive banner advertisements generated from the online browsing preferences and behaviour of each customer.
+The solution operates on a pay-per-click/cost-per-click (CPC) basis.
+
+.. figure:: https://upload.wikimedia.org/wikipedia/commons/d/d2/Criteo_logo21.svg
+
+Link to the company's website: https://www.criteo.com/
+
+
 .. include:: ../../../sklift/datasets/descr/criteo.rst
diff --git a/docs/api/datasets/fetch_hillstrom.rst b/docs/api/datasets/fetch_hillstrom.rst
@@ -6,4 +6,12 @@
 
 .. autofunction:: sklift.datasets.datasets.fetch_hillstrom
 
+About the company
+##################
+
+The dataset was provided by Kevin Hillstrom.
+Kevin is President of MineThatData, a consultancy that helps CEOs understand the complex relationship between Customers, Advertising, Products, Brands, and Channels.
+
+Link to the blog website: https://blog.minethatdata.com/
+
 .. include:: ../../../sklift/datasets/descr/hillstrom.rst
diff --git a/docs/api/datasets/fetch_lenta.rst b/docs/api/datasets/fetch_lenta.rst
@@ -6,4 +6,14 @@
 
 .. autofunction:: sklift.datasets.datasets.fetch_lenta
 
+About the company
+##################
+
+Lenta (Russian: Лента) is a Russian super- and hypermarket chain. With 149 locations across the country,
+it is one of Russia's largest retail chains in addition to being the country's second largest hypermarket chain.
+
+.. figure:: https://upload.wikimedia.org/wikipedia/commons/7/73/Lenta_logo.svg
+
+Link to the company's website: https://www.lenta.com/
+
 .. include:: ../../../sklift/datasets/descr/lenta.rst
diff --git a/docs/api/datasets/fetch_megafon.rst b/docs/api/datasets/fetch_megafon.rst
@@ -6,4 +6,14 @@
 
 .. autofunction:: sklift.datasets.datasets.fetch_megafon
 
+About the company
+##################
+
+MegaFon (Russian: МегаФон), previously known as North-West GSM, is the second largest mobile phone operator and the third largest telecom operator in Russia. 
+It operates on the GSM, UMTS and LTE standards. As of June 2012, the company served 62.1 million subscribers in Russia and 1.6 million in Tajikistan. It is headquartered in Moscow.
+
+.. figure:: https://upload.wikimedia.org/wikipedia/commons/9/9e/MegaFon_logo.svg
+
+Link to the company's website: https://megafon.ru/
+
 .. include:: ../../../sklift/datasets/descr/megafon.rst
diff --git a/docs/api/datasets/fetch_x5.rst b/docs/api/datasets/fetch_x5.rst
@@ -6,4 +6,16 @@
 
 .. autofunction:: sklift.datasets.datasets.fetch_x5
 
+About the company
+##################
+
+X5 Group is a leading Russian food retailer. 
+The Company operates several retail formats: proximity stores under the Pyaterochka brand, 
+supermarkets under the Perekrestok brand and hypermarkets under the Karusel brand, as well as the Perekrestok.ru online market, 
+the 5Post parcel service, and the Dostavka.Pyaterochka and Perekrestok.Bystro food delivery services.
+
+.. figure:: https://upload.wikimedia.org/wikipedia/en/8/83/X5_Retail_Group_logo_2015.png
+
+Link to the company's website: https://www.x5.ru/
+
 .. include:: ../../../sklift/datasets/descr/x5.rst
diff --git a/sklift/tests/test_datasets.py b/sklift/tests/test_datasets.py
@@ -34,24 +34,23 @@ def test_fetch_lenta(lenta_dataset):
     assert data.target.shape == lenta_dataset['target.shape']
     assert data.treatment.shape == lenta_dataset['treatment.shape']
 
-
-# @pytest.fixture
-# def x5_dataset() -> dict:
-#     data = {'keys': ['data', 'target', 'treatment', 'DESCR', 'feature_names', 'target_name', 'treatment_name'],
+#@pytest.fixture
+#def x5_dataset() -> dict:
+#	data = {'keys': ['data', 'target', 'treatment', 'DESCR', 'feature_names', 'target_name', 'treatment_name'],
 #             'data.keys': ['clients', 'train', 'purchases'], 'clients.shape': (400162, 5),
-#             'train.shape': (200039, 1), 'target.shape': (200039,), 'treatment.shape': (200039,)}
-#     return data
-#
+#            'train.shape': (200039, 1), 'target.shape': (200039,), 'treatment.shape': (200039,)}
+#	return data
+
 #
-# def test_fetch_x5(x5_dataset):
-#     data = fetch_x5()
-#     assert isinstance(data, sklearn.utils.Bunch)
-#     assert set(data.keys()) == set(x5_dataset['keys'])
-#     assert set(data.data.keys()) == set(x5_dataset['data.keys'])
-#     assert data.data.clients.shape == x5_dataset['clients.shape']
-#     assert data.data.train.shape == x5_dataset['train.shape']
-#     assert data.target.shape == x5_dataset['target.shape']
-#     assert data.treatment.shape == x5_dataset['treatment.shape']
+#def test_fetch_x5(x5_dataset):
+#	data = fetch_x5()
+#	assert isinstance(data, sklearn.utils.Bunch)
+#	assert set(data.keys()) == set(x5_dataset['keys'])
+#	assert set(data.data.keys()) == set(x5_dataset['data.keys'])
+#	assert data.data.clients.shape == x5_dataset['clients.shape']
+#	assert data.data.train.shape == x5_dataset['train.shape']
+#	assert data.target.shape == x5_dataset['target.shape']
+#	assert data.treatment.shape == x5_dataset['treatment.shape']
 
 
 @pytest.fixture
@@ -85,6 +84,14 @@ def test_fetch_criteo10(
     assert data.target.shape == target_shape
     assert data.treatment.shape == treatment_shape
 
+@pytest.mark.parametrize(
+    'target_col, treatment_col',
+    [('visit','new_trmnt'), ('new_target','treatment')]
+    )    
+def test_fetch_criteo_errors(target_col, treatment_col):
+	with pytest.raises(ValueError):
+		 fetch_criteo(target_col=target_col, treatment_col=treatment_col) 
+
 
 @pytest.fixture
 def hillstrom_dataset() -> dict:
@@ -111,6 +118,10 @@ def test_fetch_hillstrom(
     assert data.target.shape == target_shape
     assert data.treatment.shape == hillstrom_dataset['treatment.shape']
 
+def test_fetch_hillstrom_error():
+	with pytest.raises(ValueError):
+		 fetch_hillstrom(target_col='new_target')   
+
 
 @pytest.fixture
 def megafon_dataset() -> dict:
diff --git a/sklift/tests/test_metrics.py b/sklift/tests/test_metrics.py
@@ -7,10 +7,11 @@
 
 from sklearn.utils._testing import assert_array_almost_equal
 
+from ..metrics import make_uplift_scorer
 from ..metrics import uplift_curve, uplift_auc_score, perfect_uplift_curve
 from ..metrics import qini_curve, qini_auc_score, perfect_qini_curve
 from ..metrics import (uplift_at_k, response_rate_by_percentile,
-                       weighted_average_uplift, uplift_by_percentile, treatment_balance_curve)
+                       weighted_average_uplift, uplift_by_percentile, treatment_balance_curve, average_squared_deviation)
 
 
 def make_predictions(binary):
@@ -221,6 +222,12 @@ def test_perfect_qini_curve_hard():
 
         assert_array_almost_equal(x_actual, np.array([0., 0., 3.]))
         assert_array_almost_equal(y_actual, np.array([0.0, 0.0, 0.0]))
+ 
+def test_perfect_qini_curve_error():
+	y_true, uplift, treatment = make_predictions(binary=True)
+	with pytest.raises(TypeError):
+		perfect_qini_curve(y_true, treatment, negative_effect=5)
+        
 
 
 def test_qini_auc_score():
@@ -255,11 +262,33 @@ def test_qini_auc_score():
         treatment = [1, 0, 1]
         assert_array_almost_equal(qini_auc_score(y_true, uplift, treatment), 0.75)
 
+def test_qini_auc_score_error():
+	y_true = [1, 0]
+	uplift = [0.1, 0.3]
+	treatment = [0, 1]
+	with pytest.raises(TypeError):
+		qini_auc_score(y_true, uplift, treatment, negative_effect=5)        
+
 
 def test_uplift_at_k():
     y_true, uplift, treatment = make_predictions(binary=True)
 
     assert_array_almost_equal(uplift_at_k(y_true, uplift, treatment, strategy='by_group', k=1), np.array([0.]))
+    #assert_array_almost_equal(uplift_at_k(y_true, uplift, treatment, strategy='overall', k=2), np.array([0.]))
+
+@pytest.mark.parametrize(
+    "strategy, k",
+    [
+        ('new_strategy', 1),
+        ('by_group', -0.5),
+        ('by_group', '1'),
+        ('by_group', 2)
+    ]
+)
+def test_uplift_at_k_errors(strategy, k):
+	y_true, uplift, treatment = make_predictions(binary=True)
+	with pytest.raises(ValueError):
+		uplift_at_k(y_true, uplift, treatment, strategy, k)
 
 
 @pytest.mark.parametrize(
@@ -277,6 +306,19 @@ def test_response_rate_by_percentile(strategy, group, response_rate):
     assert_array_almost_equal(response_rate_by_percentile(y_true, uplift, treatment, group, strategy, bins=1),
                               response_rate)
 
+@pytest.mark.parametrize(
+    "strategy, group, bins",
+    [
+        ('new_strategy', 'control', 1),
+        ('by_group', 'ctrl', 1),
+        ('by_group', 'control', 0.5),
+        ('by_group', 'control', 9999)
+    ]
+)
+def test_response_rate_by_percentile_errors(strategy, group, bins):
+    y_true, uplift, treatment = make_predictions(binary=True)
+    with pytest.raises(ValueError):
+    	response_rate_by_percentile(y_true, uplift, treatment, group=group, strategy=strategy, bins=bins)
 
 @pytest.mark.parametrize(
     "strategy, weighted_average",
@@ -289,7 +331,21 @@ def test_weighted_average_uplift(strategy, weighted_average):
     y_true, uplift, treatment = make_predictions(binary=True)
 
     assert_array_almost_equal(weighted_average_uplift(y_true, uplift, treatment, strategy, bins=1), weighted_average)
+    
 
+@pytest.mark.parametrize(
+    "strategy, bins",
+    [
+        ('new_strategy', 1),
+        ('by_group', 0.5),
+        ('by_group', 9999)
+    ]
+)
+def test_weighted_average_uplift_errors(strategy, bins):
+	y_true, uplift, treatment = make_predictions(binary=True)
+	with pytest.raises(ValueError):
+		weighted_average_uplift(y_true, uplift, treatment, strategy=strategy, bins=bins)
+    
 
 @pytest.mark.parametrize(
     "strategy, bins, std, total, string_percentiles, data",
@@ -307,11 +363,68 @@ def test_uplift_by_percentile(strategy, bins, std, total, string_percentiles, da
 
     assert_array_almost_equal(
         uplift_by_percentile(y_true, uplift, treatment, strategy, bins, std, total, string_percentiles), data)
+        
+@pytest.mark.parametrize(
+    "strategy, bins, std, total, string_percentiles",
+    [
+        ('new_strategy', 1, True, True, True),
+        ('by_group', 0.5, True, True, True),
+        ('by_group', 9999, True, True, True),
+        ('by_group', 1, 2, True, True),
+        ('by_group', 1, True, True, 2),
+        ('by_group', 1, True, 2, True)
+    ]
+)
+def test_uplift_by_percentile_errors(strategy, bins, std, total, string_percentiles):
+	y_true, uplift, treatment = make_predictions(binary=True)
+	with pytest.raises(ValueError):
+		uplift_by_percentile(y_true, uplift, treatment, strategy, bins, std, total, string_percentiles)        
 
 
 def test_treatment_balance_curve():
     y_true, uplift, treatment = make_predictions(binary=True)
 
     idx, balance = treatment_balance_curve(uplift, treatment, winsize=2)
     assert_array_almost_equal(idx, np.array([1., 100.]))
-    assert_array_almost_equal(balance, np.array([1., 0.5]))
+    assert_array_almost_equal(balance, np.array([1., 0.5]))
+
+@pytest.mark.parametrize(
+    "strategy",
+    [
+        ('overall'),
+        ('by_group')
+    ]
+)    
+def test_average_squared_deviation(strategy):
+	y_true, uplift, treatment = make_predictions(binary=True)
+	assert (average_squared_deviation(y_true, uplift, treatment, y_true, uplift, treatment, strategy, bins=1) == 0)
+
+@pytest.mark.parametrize(
+    "strategy, bins",
+    [
+        ('new_strategy', 1),
+        ('by_group', 0.5),
+        ('by_group', 9999)
+    ]
+)    
+def test_average_squared_deviation_errors(strategy, bins):
+	y_true, uplift, treatment = make_predictions(binary=True)
+	with pytest.raises(ValueError):
+		average_squared_deviation(y_true, uplift, treatment, y_true, uplift, treatment, strategy=strategy, bins=bins)
+ 	
+def test_metric_name_error():
+	with pytest.raises(ValueError):
+		make_uplift_scorer('new_scorer', [0, 1])
+		
+def test_make_scorer_error():
+	with pytest.raises(TypeError):
+		make_uplift_scorer('qini_auc_score', [])	
+
+    
+ 
+
+	    
+
+
+			
+			  
diff --git a/sklift/tests/test_models.py b/sklift/tests/test_models.py
@@ -1,4 +1,5 @@
 import pytest
+import numpy as np
 from sklearn.linear_model import LogisticRegression, LinearRegression
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
@@ -43,3 +44,50 @@ def test_shape_regression(model, random_xy_dataset_regr):
     assert model.fit(X, y, treat).predict(X).shape[0] == y.shape[0]
     pipe = Pipeline(steps=[("scaler", StandardScaler()), ("clf", model)])
     assert pipe.fit(X, y, clf__treatment=treat).predict(X).shape[0] == y.shape[0]
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        SoloModel(LogisticRegression(), method='dummy'),
+        SoloModel(LogisticRegression(), method='treatment_interaction'),
+    ]
+)    		            	
+def test_solomodel_fit_error(model):
+	X, y, treatment = [[1., 0., 0.],[1., 0., 0.],[1., 0., 0.]], [1., 2., 3.], [0., 1., 0.]
+	with pytest.raises(TypeError):
+		model.fit(X, y, treatment)	
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        SoloModel(LogisticRegression(), method='dummy'),
+        SoloModel(LogisticRegression(), method='treatment_interaction'),
+    ]
+)    		            	
+def test_solomodel_pred_error(model):
+	X_train, y_train, treat_train = (np.array([[5.1, 3.5, 1.4, 0.2], [4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2]]),
+                                     np.array([0.0, 0.0, 1.0]), np.array([0.0, 1.0, 1.0]))
+	model.fit(X_train, y_train, treat_train)	
+	with pytest.raises(TypeError):			
+		model.predict(1)		
+		
+@pytest.mark.parametrize("method", ['method'])
+def test_solomodel_method_error(method):
+	with pytest.raises(ValueError):
+		SoloModel(LogisticRegression(), method=method)	
+
+def test_classtransformation_fit_error():
+	X, y, treatment = [[1., 0., 0.],[1., 0., 0.],[1., 0., 0.]], [1., 2., 3.], [0., 1., 0.]
+	with pytest.raises(ValueError):
+		ClassTransformation(LogisticRegression()).fit(X, y, treatment)			
+		
+@pytest.mark.parametrize("method", ['method'])
+def test_twomodels_method_error(method):
+	with pytest.raises(ValueError):
+		TwoModels(LinearRegression(), LinearRegression(), method=method)					
+		
+def test_same_estimator_error():
+	est = LinearRegression()
+	with pytest.raises(ValueError):
+		TwoModels(est, est)
+
diff --git a/sklift/tests/test_viz.py b/sklift/tests/test_viz.py