Merge pull request #119 from TwsThomas/fix_examples

GaelVaroquaux · web-flow · commit 57b0ab30d9e3 · 2020-05-15T09:21:21.000+02:00
[WIP] fix bug in examples
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,3 +1,11 @@
+Release 0.0.7
+=============
+* **datasets.fetch_employee_salaries**: change the origin of download for employee_salaries.
+  - The function now returns a Bunch with a dataframe under the field "data",
+    and not the path to the CSV file.
+  - The field "description" has been renamed to "DESCR".
+
+
 Release 0.0.6
 =============
 * **SimilarityEncoder**: Fixed a bug when using the Jaro-Winkler distance as a
diff --git a/dirty_cat/datasets/fetching.py b/dirty_cat/datasets/fetching.py
@@ -13,6 +13,7 @@
 from collections import namedtuple
 import contextlib
 import warnings
+from sklearn.datasets import fetch_openml
 
 from ..datasets.utils import md5_hash, _check_if_exists, \
     _uncompress_file, \
@@ -346,18 +347,23 @@ def fetch_employee_salaries():
     dict
         a dictionary containing:
 
-            - a short description of the dataset (under the ``description``
+            - a short description of the dataset (under the ``DESCR``
               key)
-            - an absolute path leading to the csv file where the data is stored
-              locally (under the ``path`` key)
+            - the tabular data (under the ``data`` key)
+            - the target (under the ``target`` key)
 
     References
     ----------
     https://catalog.data.gov/dataset/employee-salaries-2016
 
     """
 
-    return fetch_dataset(EMPLOYEE_SALARIES_CONFIG, show_progress=False)
+    data = fetch_openml(data_id=42125, as_frame=True)
+    data.data['Current Annual Salary'] = data['target']
+    return data
+
+    # The original download link is dead; the previous implementation was:
+    # return fetch_dataset(EMPLOYEE_SALARIES_CONFIG, show_progress=False)
 
 
 def fetch_road_safety():
diff --git a/examples/01_investigating_dirty_categories.py b/examples/01_investigating_dirty_categories.py
@@ -15,8 +15,8 @@
 from dirty_cat import datasets
 
 employee_salaries = datasets.fetch_employee_salaries()
-print(employee_salaries['description'])
-data = pd.read_csv(employee_salaries['path'])
+print(employee_salaries['DESCR'])
+data = employee_salaries['data']
 print(data.head(n=5))
 
 #########################################################################
@@ -25,7 +25,7 @@
 
 #########################################################################
 # As we can see, some entries have many different unique values:
-print(data['Employee Position Title'].value_counts().sort_index())
+print(data['employee_position_title'].value_counts().sort_index())
 
 #########################################################################
 # These different entries are often variations on the same entities:
@@ -47,7 +47,7 @@
 # To simplify understanding, we will focus on the column describing the
 # employee's position title:
 # data
-values = data[['Employee Position Title', 'Gender', 'Current Annual Salary']]
+values = data[['employee_position_title', 'gender', 'Current Annual Salary']]
 
 #########################################################################
 # String similarity between entries
@@ -56,7 +56,7 @@
 # That's where our encoders get into play. In order to robustly
 # embed dirty semantic data, the SimilarityEncoder creates a similarity
 # matrix based on the 3-gram structure of the data.
-sorted_values = values['Employee Position Title'].sort_values().unique()
+sorted_values = values['employee_position_title'].sort_values().unique()
 
 from dirty_cat import SimilarityEncoder
 
@@ -142,7 +142,7 @@
 
 # encoding simply a subset of the observations
 n_obs = 20
-employee_position_titles = values['Employee Position Title'].head(
+employee_position_titles = values['employee_position_title'].head(
     n_obs).to_frame()
 categorical_encoder = OneHotEncoder(sparse=False)
 one_hot_encoded = categorical_encoder.fit_transform(employee_position_titles)
diff --git a/examples/02_fit_predict_plot_employee_salaries.py b/examples/02_fit_predict_plot_employee_salaries.py
@@ -21,18 +21,17 @@
 # We first download the dataset:
 from dirty_cat.datasets import fetch_employee_salaries
 employee_salaries = fetch_employee_salaries()
-print(employee_salaries['description'])
+print(employee_salaries['DESCR'])
+
 
 ################################################################################
 # Then we load it:
 import pandas as pd
-df = pd.read_csv(employee_salaries['path']).astype(str)
+df = employee_salaries['data']
 
 ################################################################################
 # Now, let's carry out some basic preprocessing:
-df['Current Annual Salary'] = df['Current Annual Salary'].str.strip('$').astype(
-    float)
-df['Date First Hired'] = pd.to_datetime(df['Date First Hired'])
+df['Date First Hired'] = pd.to_datetime(df['date_first_hired'])
 df['Year First Hired'] = df['Date First Hired'].apply(lambda x: x.year)
 
 target_column = 'Current Annual Salary'
@@ -45,17 +44,17 @@
 # use one hot encoding to transform them:
 
 clean_columns = {
-    'Gender': 'one-hot',
-    'Department Name': 'one-hot',
-    'Assignment Category': 'one-hot',
+    'gender': 'one-hot',
+    'department_name': 'one-hot',
+    'assignment_category': 'one-hot',
     'Year First Hired': 'numerical'}
 
 #########################################################################
 # We then choose the categorical encoding methods we want to benchmark
 # and the dirty categorical variable:
 
 encoding_methods = ['one-hot', 'target', 'similarity']
-dirty_column = 'Employee Position Title'
+dirty_column = 'employee_position_title'
 #########################################################################
 
 
diff --git a/examples/04_dimension_reduction_and_performance.py b/examples/04_dimension_reduction_and_performance.py
@@ -34,7 +34,7 @@ def wrapped_func(*args, **kwargs):
                                                 max_usage=True,
                                                 retval=True)
         print("Run time: %.1is    Memory used: %iMb"
-              % (time() - t0, mem[0]))
+              % (time() - t0, mem))
         return out
 
     return wrapped_func