
Commit 7e68075

📕 Update & fix datasets description (#68)
* 📝 Add total uplift and .:4f in viz
* 📝 upd lenta description
* 📗 Fix datasets description
1 parent 4b7cea8 commit 7e68075

9 files changed: +240 −188 lines changed


docs/conf.py

Lines changed: 4 additions & 1 deletion
@@ -51,9 +51,12 @@ def get_version():
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
     "recommonmark",
-    "sphinx.ext.intersphinx"
+    "sphinx.ext.intersphinx",
+    "sphinxcontrib.bibtex"
 ]
 
+bibtex_bibfiles = ['refs.bib']
+
 master_doc = 'index'
 
 # Add any paths that contain templates here, relative to this directory.
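
For context, a minimal sketch of how the resulting configuration is typically used. The extension list and ``bibtex_bibfiles`` value mirror the diff above; the ``refs.bib`` entry key ``example2021`` and the ``:cite:`` usage shown in the comments are hypothetical illustrations, not part of this commit.

    # docs/conf.py (sketch of the relevant excerpt after this change)
    extensions = [
        "sphinx.ext.mathjax",
        "sphinx.ext.napoleon",
        "recommonmark",
        "sphinx.ext.intersphinx",
        "sphinxcontrib.bibtex",
    ]

    # sphinxcontrib-bibtex 2.x requires listing the BibTeX files explicitly
    bibtex_bibfiles = ['refs.bib']

    # In the .rst sources, entries from refs.bib can then be cited, e.g.:
    #   See :cite:`example2021` for the original paper.
    #   .. bibliography::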

docs/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 sphinx-autobuild
 sphinx_rtd_theme
-recommonmark
+recommonmark
+sphinxcontrib-bibtex

sklift/datasets/datasets.py

Lines changed: 123 additions & 73 deletions
@@ -6,36 +6,38 @@
 
 
 def get_data_dir():
-    """This function returns a directory, which stores the datasets.
+    """Return the path of the scikit-uplift data dir.
+
+    This folder is used by some large dataset loaders to avoid downloading the data several times.
+
+    By default the data dir is set to a folder named 'scikit-uplift-data' in the user home folder.
 
     Returns:
-        Full path to a directory, which stores the datasets.
+        string: The path to the scikit-uplift data dir.
 
     """
     return os.path.join(os.path.expanduser("~"), "scikit-uplift-data")
 
 
 def _create_data_dir(path):
-    """This function creates a directory, which stores the datasets.
+    """Creates a directory, which stores the datasets.
 
     Args:
-        path (str): The path to the folder where datasets are stored.
+        path (str): The path to the scikit-uplift data dir.
 
     """
     if not os.path.isdir(path):
         os.makedirs(path)
 
 
 def _download(url, dest_path):
-    '''Download the file from url and save it localy
-
+    """Download the file from url and save it locally.
+
     Args:
         url: URL address, must be a string.
         dest_path: Destination of the file.
 
-    Returns:
-        TypeError if URL is not a string.
-    '''
+    """
     if isinstance(url, str):
         req = requests.get(url, stream=True)
         req.raise_for_status()
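
A minimal usage sketch of the cache-directory helpers documented above (this assumes ``get_data_dir`` and ``clear_data_dir`` are re-exported from ``sklift.datasets``; if not, import them from ``sklift.datasets.datasets``):

    from sklift.datasets import get_data_dir, clear_data_dir

    # All loaders cache their downloads under this folder (default: ~/scikit-uplift-data)
    print(get_data_dir())

    # Wipe the cache so the next fetch_* call downloads the data again
    clear_data_dir()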
@@ -51,14 +53,16 @@ def _get_data(data_home, url, dest_subdir, dest_filename, download_if_missing):
     """Return the path to the dataset.
 
     Args:
-        data_home (str, unicode): The path to the folder where datasets are stored.
+        data_home (str, unicode): The path to the scikit-uplift data dir.
         url (str or unicode): The URL to the dataset.
         dest_subdir (str or unicode): The name of the folder in which the dataset is stored.
         dest_filename (str): The name of the dataset.
-        download_if_missing (bool): Flag if dataset is missing.
+        download_if_missing (bool): If False, raise an IOError if the data is not locally available instead of
+            trying to download the data from the source site.
 
     Returns:
-        The path to the dataset.
+        string: The path to the dataset.
+
     """
     if data_home is None:
         if dest_subdir is None:
@@ -84,43 +88,59 @@ def _get_data(data_home, url, dest_subdir, dest_filename, download_if_missing):
 
 
 def clear_data_dir(path=None):
-    """This function deletes the file.
+    """Delete all the content of the data home cache.
 
     Args:
-        path (str): File path. By default, this is the default path for datasets.
-    """
+        path (str): The path to the scikit-uplift data dir.
+
+    """
     if path is None:
         path = get_data_dir()
     if os.path.isdir(path):
         shutil.rmtree(path, ignore_errors=True)
 
 
-def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, return_X_y_t=False, as_frame=False):
-    """Fetch the Lenta dataset.
 
-    Args:
-        data_home (str, unicode): The path to the folder where datasets are stored.
-        dest_subdir (str, unicode): The name of the folder in which the dataset is stored.
-        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
-        return_X_y_t (bool): If True, returns (data, target, treatment) instead of a Bunch object.
-            See below for more information about the data and target object.
-        as_frame (bool):
-
-    Returns:
-        * dataset ('~sklearn.utils.Bunch'): Dictionary-like object, with the following attributes.
-            * data (DataFrame object): Dataset without target and treatment.
-            * target (Series object): Column target by values.
-            * treatment (Series object): Column treatment by values.
-            * DESCR (str): Description of the Lenta dataset.
-
-        * (data, target, treatment): tuple if 'return_X_y_t' is True.
+def fetch_lenta(return_X_y_t=False, data_home=None, dest_subdir=None, download_if_missing=True):
+    """Load and return the Lenta dataset (classification).
+
+    An uplift modeling dataset containing data about Lenta's customers' grocery shopping and related marketing campaigns.
+
+    Major columns:
+
+    - ``group`` (str): treatment/control group flag
+    - ``response_att`` (binary): target
+    - ``gender`` (str): customer gender
+    - ``age`` (float): customer age
+    - ``main_format`` (int): store type (1 - grocery store, 0 - superstore)
+
+    Args:
+        return_X_y_t (bool): If True, returns (data, target, treatment) instead of a Bunch object.
+            See below for more information about the data and target object.
+        data_home (str, unicode): The path to the folder where datasets are stored.
+        dest_subdir (str, unicode): The name of the folder in which the dataset is stored.
+        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
+
+    Returns:
+        Bunch or tuple: dataset.
+
+        By default, a dictionary-like object with the following attributes:
+
+        * ``data`` (DataFrame object): Dataset without target and treatment.
+        * ``target`` (Series object): Column target by values.
+        * ``treatment`` (Series object): Column treatment by values.
+        * ``DESCR`` (str): Description of the Lenta dataset.
+
+        Tuple (data, target, treatment) if ``return_X_y_t`` is True.
     """
 
-    url = 'https://winterschool123.s3.eu-north-1.amazonaws.com/lentadataset.csv.gz'
-    filename = 'lentadataset.csv.gz'
-    csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir,
-                         dest_filename=filename,
-                         download_if_missing=download_if_missing)
+    url = 'https://winterschool123.s3.eu-north-1.amazonaws.com/lentadataset.csv.gz'
+    filename = 'lentadataset.csv.gz'
+
+    csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir,
+                         dest_filename=filename,
+                         download_if_missing=download_if_missing)
+
     data = pd.read_csv(csv_path)
     if as_frame:
         target = data['response_att']
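
A short usage sketch of the new ``fetch_lenta`` signature, following the docstring above (it assumes the loader is exported from ``sklift.datasets``; the first call downloads ``lentadataset.csv.gz`` into the data dir):

    from sklift.datasets import fetch_lenta

    # Default: a Bunch with data / target / treatment / DESCR
    dataset = fetch_lenta()
    X, y, treat = dataset.data, dataset.target, dataset.treatment
    print(dataset.DESCR[:100])

    # Or unpack directly as a tuple
    X, y, treat = fetch_lenta(return_X_y_t=True)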
@@ -145,27 +165,33 @@ def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, retu
                       feature_names=feature_names, target_name='response_att', treatment_name='group')
 
 
-def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True, as_frame=False):
-    """Fetch the X5 dataset.
+def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True):
+    """Load the X5 dataset.
+
+    The dataset contains raw retail customer purchases, raw information about products and general info about customers.
+
+    Major columns:
+
+    - ``treatment_flg`` (binary): treatment/control group flag
+    - ``target`` (binary): target
+    - ``customer_id`` (str): customer id aka primary key for joining
 
     Args:
-        data_home (string): Specify a download and cache folder for the datasets.
-        dest_subdir (string, unicode): The name of the folder in which the dataset is stored.
-        download_if_missing (bool, default=True): If False, raise an IOError if the data is not locally available
-            instead of trying to download the data from the source site.
-        as_frame (bool, default=False):
+        data_home (str, unicode): The path to the folder where datasets are stored.
+        dest_subdir (str, unicode): The name of the folder in which the dataset is stored.
+        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
 
     Returns:
-        '~sklearn.utils.Bunch': dataset
-            Dictionary-like object, with the following attributes.
-            data ('~sklearn.utils.Bunch'): Dataset without target and treatment.
-            target (Series object): Column target by values
-            treatment (Series object): Column treatment by values
-            DESCR (str): Description of the X5 dataset.
-            train (DataFrame object): Dataset with target and treatment.
-            data_names ('~sklearn.utils.Bunch'): Names of features.
-            treatment_name (string): The name of the treatment column.
+        Bunch: dataset. Dictionary-like object, with the following attributes.
+
+        * data ('~sklearn.utils.Bunch'): Dataset without target and treatment.
+        * target (Series object): Column target by values.
+        * treatment (Series object): Column treatment by values.
+        * DESCR (str): Description of the X5 dataset.
+        * train (DataFrame object): Dataset with target and treatment.
+
     """
+
     url_clients = 'https://timds.s3.eu-central-1.amazonaws.com/clients.csv.gz'
     file_clients = 'clients.csv.gz'
     csv_clients_path = _get_data(data_home=data_home, url=url_clients, dest_subdir=dest_subdir,
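
A hedged sketch of working with the X5 Bunch described above. The ``train``, ``target`` and ``treatment`` attributes come from the docstring; joining the raw client table on ``customer_id`` only illustrates the "primary key for joining" remark and is not an API guaranteed by this commit:

    from sklift.datasets import fetch_x5

    dataset = fetch_x5()       # downloads clients.csv.gz (and related files) on first use
    train = dataset.train      # DataFrame that already contains target and treatment
    y, treat = dataset.target, dataset.treatment
    print(dataset.DESCR[:100])

    # Hypothetical enrichment: join raw client attributes onto the train table
    # (assumes dataset.data exposes the clients table with a customer_id column)
    # train = train.merge(dataset.data.clients, on='customer_id', how='left')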
@@ -213,8 +239,19 @@ def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True, as_fram
 
 def fetch_criteo(data_home=None, dest_subdir=None, download_if_missing=True, percent10=True,
                  treatment_feature='treatment', target_column='visit', return_X_y_t=False, as_frame=False):
-    """Load data from the Criteo dataset
-
+    """Load data from the Criteo dataset.
+
+    This dataset is constructed by assembling data resulting from several incrementality tests, a particular randomized
+    trial procedure where a random part of the population is prevented from being targeted by advertising.
+
+    Major columns:
+
+    * ``treatment`` (binary): treatment
+    * ``exposure`` (binary): treatment
+    * ``visit`` (binary): target
+    * ``conversion`` (binary): target
+    * ``f0, ... , f11`` (float): feature values
+
     Args:
         data_home (string): Specify a download and cache folder for the datasets.
         dest_subdir (string, unicode): The name of the folder in which the dataset is stored.
@@ -227,7 +264,8 @@ def fetch_criteo(data_home=None, dest_subdir=None, download_if_missing=True, per
             will be target
         return_X_y_t (bool, default=False): If True, returns (data, target, treatment) instead of a Bunch object.
             See below for more information about the data and target object.
-        as_frame (bool, default=False):
+        as_frame (bool, default=False): If True, return as pandas.Series
+
     Returns:
         ''~sklearn.utils.Bunch'': dataset
             Dictionary-like object, with the following attributes.
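
A short sketch of the Criteo loader options named in the signature and column list above. Passing ``exposure`` as an alternative ``treatment_feature`` is an assumption suggested by the column list, not something stated explicitly in the diff:

    from sklift.datasets import fetch_criteo

    # 10% sample (default), visit as target, treatment as the treatment flag
    data, target, treatment = fetch_criteo(return_X_y_t=True)

    # Full dataset, conversion as target, exposure as the treatment column
    dataset = fetch_criteo(percent10=False,
                           target_column='conversion',
                           treatment_feature='exposure')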
@@ -300,29 +338,41 @@
 def fetch_hillstrom(data_home=None, dest_subdir=None, download_if_missing=True, target_column='visit',
                     return_X_y_t=False, as_frame=False):
     """Load the hillstrom dataset.
+
+    This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test.
+
+    Major columns:
+
+    * ``Visit`` (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks.
+    * ``Conversion`` (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.
+    * ``Spend`` (float): target. Actual dollars spent in the following two weeks.
+    * ``Segment`` (str): treatment. The e-mail campaign the customer received.
 
-    Args:
-        data_home : str, default=None
-            Specify another download and cache folder for the datasets.
-        dest_subdir : str, default=None
-        download_if_missing : bool, default=True
-            If False, raise a IOError if the data is not locally available
-            instead of trying to download the data from the source site.
+    Args:
+        target : str, default='visit'.
+            Can also be 'conversion' or 'spend'.
+        data_home : str, default=None
+            Specify another download and cache folder for the datasets.
+        dest_subdir : str, default=None
+        download_if_missing : bool, default=True
+            If False, raise an IOError if the data is not locally available
+            instead of trying to download the data from the source site.
         target_column (string, 'visit' or 'conversion' or 'spend', default='visit'): Selects which column from dataset
-            will be target
+            will be target
         return_X_y_t (bool):
         as_frame (bool):
 
-    Returns:
-        Dictionary-like object, with the following attributes.
-        data : {ndarray, dataframe} of shape (64000, 12)
+    Returns:
+        Dictionary-like object, with the following attributes.
+        data : {ndarray, dataframe} of shape (64000, 12)
             The data matrix to learn.
-        target : {ndarray, series} of shape (64000,)
+        target : {ndarray, series} of shape (64000,)
             The regression target for each sample.
-        treatment : {ndarray, series} of shape (64000,)
-        feature_names (list): The names of the future columns
-        target_name (string): The name of the target column.
-        treatment_name (string): The name of the treatment column
+        treatment : {ndarray, series} of shape (64000,)
+        feature_names (list): The names of the feature columns.
+        target_name (string): The name of the target column.
+        treatment_name (string): The name of the treatment column.
+
     """
 
     url = 'https://hillstorm1.s3.us-east-2.amazonaws.com/hillstorm_no_indices.csv.gz'
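
Likewise for the Hillstrom loader: ``target_column`` picks one of the three outcomes listed above, with ``spend`` giving a regression-style target (a sketch, assuming the loader is exported from ``sklift.datasets``):

    from sklift.datasets import fetch_hillstrom

    # Binary website-visit target (the default)
    dataset = fetch_hillstrom()

    # Continuous two-week spend as the target instead
    data, spend, segment = fetch_hillstrom(target_column='spend', return_X_y_t=True)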

sklift/datasets/descr/criteo.rst

Lines changed: 9 additions & 8 deletions
@@ -1,24 +1,27 @@
 Criteo Uplift Modeling Dataset
 ================================
-This is a copy of `Criteo AI Lab Uplift Prediction dataset <https://ailab.criteo.com/criteo-uplift-prediction-dataset/>`_.
+This is a copy of `Criteo AI Lab Uplift Prediction dataset <https://ailab.criteo.com/criteo-uplift-prediction-dataset/>`_.
 
 Data description
------------------
+################
+
 This dataset is constructed by assembling data resulting from several incrementality tests, a particular randomized trial procedure where a random part of the population is prevented from being targeted by advertising.
 
+
 Fields
---------
+################
 
 Here is a detailed description of the fields (they are comma-separated in the file):
 
 * **f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11**: feature values (dense, float)
-* **treatment**: treatment group (1 = treated, 0 = control)
+* **treatment**: treatment group. Flag if a company participates in the RTB auction for a particular user (binary: 1 = treated, 0 = control)
+* **exposure**: treatment effect, whether the user has been effectively exposed. Flag if a company wins in the RTB auction for the user (binary)
 * **conversion**: whether a conversion occured for this user (binary, label)
 * **visit**: whether a visit occured for this user (binary, label)
-* **exposure**: treatment effect, whether the user has been effectively exposed (binary)
+
 
 Key figures
--------------
+################
 * Format: CSV
 * Size: 297M (compressed) 3,2GB (uncompressed)
 * Rows: 13,979,592

@@ -36,5 +39,3 @@ This work was published in: `AdKDD 2018 <https://adkdd-targetad.wixsite.com/201
 
 
 
-
-
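
Given the size quoted above (roughly 3.2 GB uncompressed, about 14M rows), the raw file is often read column-wise or in chunks rather than all at once; a hedged pandas sketch (the local file name is a placeholder for wherever the download was saved):

    import pandas as pd

    # Load only the label/treatment columns for a quick sanity check
    cols = ['treatment', 'exposure', 'visit', 'conversion']
    labels = pd.read_csv('criteo-uplift.csv.gz', usecols=cols, compression='gzip')

    # Or stream the full file in bounded-memory chunks
    for chunk in pd.read_csv('criteo-uplift.csv.gz', chunksize=1_000_000, compression='gzip'):
        pass  # accumulate per-chunk statistics here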

sklift/datasets/descr/hillstrom.rst

Lines changed: 10 additions & 8 deletions
@@ -1,15 +1,15 @@
-Kevin Hillstrom: MineThatData
-===============================
-Helping CEOs Understand How Customers Interact With Advertising, Products, Brands, and Channels
-------------
-**March 20, 2008**
+Kevin Hillstrom Dataset: MineThatData
+=====================================
 
-The MineThatData E-Mail Analytics And Data Mining Challenge
-It is time to find a few smart individuals in the world of e-mail analytics and data mining! And honestly, what follows is a dataset that you can manipulate using Excel pivot tables, so you don't have to be a data mining wizard, just be clever!
+Data description
+################
 
-[Here is a link to the MineThatData E-Mail Analytics And Data Mining Challenge dataset:](http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv) The dataset is in .csv format, and is about the size of a typical mp3 file. I recommend saving the file to disk, then open the file (read only) in the software tool of your choice.
+This is a copy of the `MineThatData E-Mail Analytics And Data Mining Challenge dataset <https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html/>`_.
+
+Date: March 20, 2008
 
 This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test.
+
 * 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise.
 * 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise.
 * 1/3 were randomly chosen to not receive an e-mail campaign.

@@ -28,12 +28,14 @@ Historical customer attributes at your disposal include:
 * Channel: Describes the channels the customer purchased from in the past year.
 
 Another variable describes the e-mail campaign the customer received:
+
 * Segment
     * Mens E-Mail
     * Womens E-Mail
     * No E-Mail
 
 Finally, we have a series of variables describing activity in the two weeks following delivery of the e-mail campaign:
+
 * Visit: 1/0 indicator, 1 = Customer visited website in the following two weeks.
 * Conversion: 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.
 * Spend: Actual dollars spent in the following two weeks.
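
Since the description defines one treatment variable (Segment) and three outcome variables, a per-segment response summary is a natural first look at the data; a sketch with pandas (the CSV path is a placeholder, and the lowercase column names are an assumption about the raw file):

    import pandas as pd

    df = pd.read_csv('hillstrom.csv')

    # Mean outcome per e-mail segment: Mens E-Mail / Womens E-Mail / No E-Mail
    summary = df.groupby('segment')[['visit', 'conversion', 'spend']].mean()
    print(summary)

    # Naive uplift of the Mens campaign on visits, relative to no e-mail
    visit_uplift = summary.loc['Mens E-Mail', 'visit'] - summary.loc['No E-Mail', 'visit']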
