@@ -15,7 +15,7 @@ def get_data_dir():
     return os.path.join(os.path.expanduser("~"), "scikit-uplift-data")


-def create_data_dir(path):
+def _create_data_dir(path):
     """This function creates a directory, which stores the datasets.

     Args:
@@ -26,7 +26,7 @@ def create_data_dir(path):
         os.makedirs(path)


-def download(url, dest_path):
+def _download(url, dest_path):
     '''Download the file from url and save it locally

     Args:
@@ -47,7 +47,7 @@ def download(url, dest_path):
         raise TypeError("URL must be a string")


-def get_data(data_home, url, dest_subdir, dest_filename, download_if_missing):
+def _get_data(data_home, url, dest_subdir, dest_filename, download_if_missing):
     """Return the path to the dataset.

     Args:
@@ -71,13 +71,13 @@ def get_data(data_home, url, dest_subdir, dest_filename, download_if_missing):
     else:
         data_dir = os.path.join(os.path.abspath(data_home), dest_subdir)

-    create_data_dir(data_dir)
+    _create_data_dir(data_dir)

     dest_path = os.path.join(data_dir, dest_filename)

     if not os.path.isfile(dest_path):
         if download_if_missing:
-            download(url, dest_path)
+            _download(url, dest_path)
         else:
             raise IOError("Dataset missing")
     return dest_path
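For orientation, a minimal sketch of how the renamed private helper is meant to be called from the public fetch_* functions below; the URL and filename are placeholders, not values from this diff:

    # Hypothetical call: resolve the cache directory, download the archive only
    # if it is missing, and return the local path to the cached file.
    csv_path = _get_data(data_home=None,          # defaults to ~/scikit-uplift-data
                         url='https://example.com/some_dataset.csv.gz',   # placeholder
                         dest_subdir=None,
                         dest_filename='some_dataset.csv.gz',             # placeholder
                         download_if_missing=True)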
@@ -95,15 +95,16 @@ def clear_data_dir(path=None):
     shutil.rmtree(path, ignore_errors=True)


-def fetch_lenta(return_X_y_t=False, data_home=None, dest_subdir=None, download_if_missing=True):
-    ''' Fetch the Lenta dataset.
+def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, return_X_y_t=False, as_frame=False):
+    """Fetch the Lenta dataset.

     Args:
-        return_X_y_t (bool): If True, returns (data, target, treatment) instead of a Bunch object.
-            See below for more information about the data and target object.
         data_home (str, unicode): The path to the folder where datasets are stored.
         dest_subdir (str, unicode): The name of the folder in which the dataset is stored.
         download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
+        return_X_y_t (bool): If True, returns (data, target, treatment) instead of a Bunch object.
+            See below for more information about the data and target object.
+        as_frame (bool): If True, returns data, target and treatment as pandas objects; otherwise as numpy arrays.

     Returns:
         * dataset ('~sklearn.utils.Bunch'): Dictionary-like object, with the following attributes.
@@ -113,69 +114,101 @@ def fetch_lenta(return_X_y_t=False, data_home=None, dest_subdir=None, download_i
             * DESCR (str): Description of the Lenta dataset.

         * (data, target, treatment): tuple if 'return_X_y_t' is True.
-    '''
-    url = 'https:/winterschool123.s3.eu-north-1.amazonaws.com/lentadataset.csv.gz'
-    filename = 'lentadataset.csv.gz'
-    csv_path = get_data(data_home=data_home, url=url, dest_subdir=dest_subdir,
-                        dest_filename=filename,
-                        download_if_missing=download_if_missing)
+    """
+
+    url = 'https://winterschool123.s3.eu-north-1.amazonaws.com/lentadataset.csv.gz'
+    filename = 'lentadataset.csv.gz'
+    csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir,
+                         dest_filename=filename,
+                         download_if_missing=download_if_missing)
     data = pd.read_csv(csv_path)
-    target = data['response_att']
-    treatment = data['group']
-    data = data.drop(['response_att', 'group'], axis=1)
+    if as_frame:
+        target = data['response_att']
+        treatment = data['group']
+        data = data.drop(['response_att', 'group'], axis=1)
+        feature_names = list(data.columns)
+    else:
+        target = data[['response_att']].to_numpy()
+        treatment = data[['group']].to_numpy()
+        data = data.drop(['response_att', 'group'], axis=1)
+        feature_names = list(data.columns)
+        data = data.to_numpy()

     module_path = os.path.dirname(__file__)
     with open(os.path.join(module_path, 'descr', 'lenta.rst')) as rst_file:
         fdescr = rst_file.read()

-    if return_X_y_t == True:
+    if return_X_y_t:
         return data, target, treatment

-    return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr)
+    return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr,
+                 feature_names=feature_names, target_name='response_att', treatment_name='group')


-def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True):
+def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True, as_frame=False):
     """Fetch the X5 dataset.

-    Args:
-        '~sklearn.utils.Bunch': dataset
+    Args:
+        data_home (string): Specify a download and cache folder for the datasets.
+        dest_subdir (string, unicode): The name of the folder in which the dataset is stored.
+        download_if_missing (bool, default=True): If False, raise an IOError if the data is not locally available
+            instead of trying to download the data from the source site.
+        as_frame (bool, default=False): If True, returns the data as pandas objects; otherwise as numpy arrays.
+
+    Returns:
+        '~sklearn.utils.Bunch': dataset
             Dictionary-like object, with the following attributes.
-            data ('~sklearn.utils.Bunch'): Dataset without target and treatment.
-            target (Series object): Values of the target column.
-            treatment (Series object): Values of the treatment column.
-            DESCR (str): Description of the X5 dataset.
-            train (DataFrame object): Dataset with target and treatment.
+            data ('~sklearn.utils.Bunch'): Dataset without target and treatment.
+            target (Series object): Values of the target column.
+            treatment (Series object): Values of the treatment column.
+            DESCR (str): Description of the X5 dataset.
+            train (DataFrame object): Dataset with target and treatment.
+            data_names ('~sklearn.utils.Bunch'): Names of the features.
+            treatment_name (string): The name of the treatment column.
     """
     url_clients = 'https://timds.s3.eu-central-1.amazonaws.com/clients.csv.gz'
     file_clients = 'clients.csv.gz'
-    csv_clients_path = get_data(data_home=data_home, url=url_clients, dest_subdir=dest_subdir,
+    csv_clients_path = _get_data(data_home=data_home, url=url_clients, dest_subdir=dest_subdir,
                                 dest_filename=file_clients,
                                 download_if_missing=download_if_missing)
     clients = pd.read_csv(csv_clients_path)
+    clients_names = list(clients.columns)

     url_train = 'https://timds.s3.eu-central-1.amazonaws.com/uplift_train.csv.gz'
     file_train = 'uplift_train.csv.gz'
-    csv_train_path = get_data(data_home=data_home, url=url_train, dest_subdir=dest_subdir,
+    csv_train_path = _get_data(data_home=data_home, url=url_train, dest_subdir=dest_subdir,
                               dest_filename=file_train,
                               download_if_missing=download_if_missing)
     train = pd.read_csv(csv_train_path)
+    train_names = list(train.columns)

     url_purchases = 'https://timds.s3.eu-central-1.amazonaws.com/purchases.csv.gz'
     file_purchases = 'purchases.csv.gz'
-    csv_purchases_path = get_data(data_home=data_home, url=url_purchases, dest_subdir=dest_subdir,
+    csv_purchases_path = _get_data(data_home=data_home, url=url_purchases, dest_subdir=dest_subdir,
                                   dest_filename=file_purchases,
                                   download_if_missing=download_if_missing)
     purchases = pd.read_csv(csv_purchases_path)
+    purchases_names = list(purchases.columns)

-    target = train['target']
-    treatment = train['treatment_flg']
+    if as_frame:
+        target = train['target']
+        treatment = train['treatment_flg']
+    else:
+        target = train[['target']].to_numpy()
+        treatment = train[['treatment_flg']].to_numpy()
+        train = train.to_numpy()
+        clients = clients.to_numpy()
+        purchases = purchases.to_numpy()

     module_path = os.path.dirname(__file__)
     with open(os.path.join(module_path, 'descr', 'x5.rst')) as rst_file:
         fdescr = rst_file.read()

     return Bunch(data=Bunch(clients=clients, train=train, purchases=purchases),
-                 target=target, treatment=treatment, DESCR=fdescr)
+                 target=target, treatment=treatment, DESCR=fdescr,
+                 data_names=Bunch(clients_names=clients_names, train_names=train_names,
+                                  purchases_names=purchases_names),
+                 treatment_name='treatment_flg')


 def fetch_criteo(data_home=None, dest_subdir=None, download_if_missing=True, percent10=True,
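To make the new flags concrete, here is a rough usage sketch for fetch_lenta and fetch_x5 with the return_X_y_t/as_frame parameters added above. The import path is assumed (not shown in this diff), and only attributes that the patch itself adds are used:

    from sklift.datasets import fetch_lenta, fetch_x5  # assumed import path

    # Bunch of pandas objects plus the new metadata fields added by this patch.
    lenta = fetch_lenta(as_frame=True)
    print(lenta.feature_names, lenta.target_name, lenta.treatment_name)

    # Plain numpy arrays, suitable for feeding straight into an estimator.
    X, y, t = fetch_lenta(return_X_y_t=True, as_frame=False)

    # X5 keeps its nested Bunch of tables; data_names mirrors their column names.
    x5 = fetch_x5(as_frame=True)
    clients_df = x5.data.clients
    print(x5.data_names.train_names, x5.treatment_name)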
@@ -209,14 +242,14 @@ def fetch_criteo(data_home=None, dest_subdir=None, download_if_missing=True, per
209242 """
210243 if percent10 :
211244 url = 'https://criteo-bucket.s3.eu-central-1.amazonaws.com/criteo10.csv.gz'
212- csv_path = get_data (data_home = data_home , url = url , dest_subdir = dest_subdir ,
245+ csv_path = _get_data (data_home = data_home , url = url , dest_subdir = dest_subdir ,
213246 dest_filename = 'criteo10.csv.gz' ,
214247 download_if_missing = download_if_missing )
215248 else :
216249 url = "https://criteo-bucket.s3.eu-central-1.amazonaws.com/criteo.csv.gz"
217- csv_path = get_data (data_home = data_home , url = url , dest_subdir = dest_subdir ,
218- dest_filename = 'criteo.csv.gz' ,
219- download_if_missing = download_if_missing )
250+ csv_path = _get_data (data_home = data_home , url = url , dest_subdir = dest_subdir ,
251+ dest_filename = 'criteo.csv.gz' ,
252+ download_if_missing = download_if_missing )
220253
221254 if treatment_feature == 'exposure' :
222255 data = pd .read_csv (csv_path , usecols = [i for i in range (12 )])
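A quick, hedged usage sketch for fetch_criteo with the parameters visible in this hunk (percent10 and treatment_feature='exposure'); the import path is assumed, and the printed fields mirror the Bunch attributes shown further down in this diff:

    from sklift.datasets import fetch_criteo  # assumed import path

    # 10% sample of the Criteo uplift dataset, using 'exposure' as the treatment column.
    criteo = fetch_criteo(percent10=True, treatment_feature='exposure')
    print(criteo.target_name, criteo.treatment_name)
    print(len(criteo.feature_names))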
@@ -264,22 +297,21 @@ def fetch_criteo(data_home=None, dest_subdir=None, download_if_missing=True, per
                 feature_names=feature_names, target_name=target_name, treatment_name=treatment_name)


-def fetch_hillstrom(target='visit',
-                    data_home=None,
-                    dest_subdir=None,
-                    download_if_missing=True,
-                    return_X_y=False):
+def fetch_hillstrom(data_home=None, dest_subdir=None, download_if_missing=True, target_column='visit',
+                    return_X_y_t=False, as_frame=False):
     """Load the hillstrom dataset.

     Args:
-        target : str, desfault=visit.
-            Can also be conversion, and spend
         data_home : str, default=None
             Specify another download and cache folder for the datasets.
         dest_subdir : str, default=None
         download_if_missing : bool, default=True
             If False, raise an IOError if the data is not locally available
             instead of trying to download the data from the source site.
+        target_column (string, 'visit', 'conversion' or 'spend', default='visit'): Selects which column of the
+            dataset will be used as the target.
+        return_X_y_t (bool): If True, returns (data, target, treatment) instead of a Bunch object.
+        as_frame (bool): If True, returns data, target and treatment as pandas objects; otherwise as numpy arrays.

     Returns:
         Dictionary-like object, with the following attributes.
@@ -288,24 +320,41 @@ def fetch_hillstrom(target='visit',
         target : {ndarray, series} of shape (64000,)
             The regression target for each sample.
         treatment : {ndarray, series} of shape (64000,)
+        feature_names (list): The names of the feature columns.
+        target_name (string): The name of the target column.
+        treatment_name (string): The name of the treatment column.
     """

     url = 'https://hillstorm1.s3.us-east-2.amazonaws.com/hillstorm_no_indices.csv.gz'
-    csv_path = get_data(data_home=data_home,
+    csv_path = _get_data(data_home=data_home,
                         url=url,
                         dest_subdir=dest_subdir,
                         dest_filename='hillstorm_no_indices.csv.gz',
                         download_if_missing=download_if_missing)
-    hillstrom = pd.read_csv(csv_path)
-    hillstrom_data = hillstrom.drop(columns=['segment', target])
+
+    if target_column not in ('visit', 'conversion', 'spend'):
+        raise ValueError(f"target_column value must be one of {['visit', 'conversion', 'spend']}. "
+                         f"Got value {target_column}.")
+
+    data = pd.read_csv(csv_path, usecols=[i for i in range(8)])
+    feature_names = list(data.columns)
+    treatment = pd.read_csv(csv_path, usecols=['segment'])
+    target = pd.read_csv(csv_path, usecols=[target_column])
+    if as_frame:
+        target = target[target_column]
+        treatment = treatment['segment']
+    else:
+        data = data.to_numpy()
+        target = target.to_numpy()
+        treatment = treatment.to_numpy()

     module_path = os.path.dirname('__file__')
     with open(os.path.join(module_path, 'descr', 'hillstrom.rst')) as rst_file:
         fdescr = rst_file.read()

-    if return_X_y:
-        return treatment, data, target
-
-    return Bunch(treatment=hillstrom['segment'],
-                 target=hillstrom[target],
-                 data=hillstrom_data, DESCR=fdescr)
+    if return_X_y_t:
+        return data, target, treatment
+    else:
+        target_name = target_column
+        return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr,
+                     feature_names=feature_names, target_name=target_name, treatment_name='segment')
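Finally, a hedged usage sketch for fetch_hillstrom with the new target_column, return_X_y_t and as_frame parameters; the import path is assumed, and the attributes used are the ones added by this patch:

    from sklift.datasets import fetch_hillstrom  # assumed import path

    # Default: Bunch with numpy arrays and the 'visit' target.
    hillstrom = fetch_hillstrom()
    print(hillstrom.target_name, hillstrom.treatment_name)   # 'visit', 'segment'

    # Pandas objects with a different target column, unpacked as a tuple.
    X, y, t = fetch_hillstrom(target_column='spend', return_X_y_t=True, as_frame=True)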