SkopeRules finds logical rules with high precision and fuses them. Finding
-good rules is done by fitting classification or regression trees
+good rules is done by fitting classification and regression trees
to sub-samples.
A fitted tree defines a set of rules (each tree node defines a rule); rules
are then tested out of the bag, and the ones with the highest precision are kept.
-This set of rules is decision function, reflecting for
-each new samples how many rules have find it abnormal.

This example aims at finding logical rules to predict credit defaults. The
rules' performance is then compared with that of a Random Forest classifier
used as a benchmark.

-The dataset comes from BLABLABLA.
"""
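
###############################################################################
# As a minimal, hypothetical sketch of the decision function described above
# (not part of the original example): given rules written in the pandas
# ``DataFrame.eval`` syntax, the score of a sample is the number of rules
# that flag it. The rules and the toy data below are made up; only the
# feature names mirror the credit-default dataset.

import pandas as pd

hypothetical_rules = ["PAY_1 > 1 and BILL_AMT1 > 5000",
                      "AGE < 25 and LIMIT_BAL < 10000"]

toy = pd.DataFrame({"PAY_1": [2, 0], "BILL_AMT1": [9000, 100],
                    "AGE": [22, 40], "LIMIT_BAL": [5000, 200000]})

# The decision function counts how many rules find each sample abnormal.
decision = sum(toy.eval(rule).astype(int) for rule in hypothetical_rules)
print(decision.tolist())  # [2, 0]: the first sample triggers both rules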

###############################################################################

# ...

for col in ['ID']:
    del data[col]

-# data = pd.get_dummies(data, columns = ['SEX', 'EDUCATION', 'MARRIAGE'])
-
# Quick feature engineering
data = data.rename(columns={"PAY_0": "PAY_1"})
old_PAY = ['PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# ...

# Creating the train/test split
feature_names = list(data.columns)
-print(feature_names)
+print("List of variables used to train models: " + str(feature_names))
data = data.values
n_samples = data.shape[0]
n_samples_train = int(n_samples / 2)

# ...

X_test = data[n_samples_train:]

###############################################################################
-# Benchmark with a Decision Tree and Random Forests
+# Benchmark with a Random Forest classifier
# ..................
#
-# This part shows the training and performance evaluation of
-# two tree-based models.
-# The objective remains to extract rules which targets credit defaults.
-# This benchmark shows the performance reached with a decision tree and a
-# random forest.
+# This part shows the training and performance evaluation of a random forest
+# model. The objective remains to extract rules that target credit defaults.

RF = GridSearchCV(
    RandomForestClassifier(
        # ...
        class_weight='balanced'),
    param_grid={
        'max_depth': range(3, 8, 1),
-        'max_features': np.linspace(0.1, 0.2, 1.)
+        'max_features': np.linspace(0.1, 1., 5)
    },
    scoring={'AUC': 'roc_auc'}, cv=5,
    refit='AUC', n_jobs=-1)

RF.fit(X_train, y_train)
scoring_RF = RF.predict_proba(X_test)[:, 1]

-# print("Decision Tree selected parameters : "+str(DT.best_params_))
-print("Random Forest selected parameters : "+str(RF.best_params_))
+print("Random Forest selected parameters: " + str(RF.best_params_))

# Plot ROC and PR curves

fig, axes = plt.subplots(1, 2, figsize=(12, 5),
                         sharex=True, sharey=True)

ax = axes[0]
-# fpr_DT, tpr_DT, _ = roc_curve(y_test, scoring_DT)
fpr_RF, tpr_RF, _ = roc_curve(y_test, scoring_RF)
ax.step(fpr_RF, tpr_RF, linestyle='-.', c='g', lw=1, where='post')
ax.set_title("ROC", fontsize=20)
# ...
ax.set_ylabel('True Positive Rate (Recall)', fontsize=18)

ax = axes[1]
-# precision_DT, recall_DT, _ = precision_recall_curve(y_test, scoring_DT)
precision_RF, recall_RF, _ = precision_recall_curve(y_test, scoring_RF)
ax.step(recall_RF, precision_RF, linestyle='-.', c='g', lw=1, where='post')
ax.set_title("Precision-Recall", fontsize=20)

# ...

###############################################################################
# The ROC and Precision-Recall curves show the performance of the rules
-# generated by SkopeRulesthe (blue points) and the performance of the Random
-# Forest classifier fitted above.
+# generated by SkopeRules (the blue points) and the performance of the
+# Random Forest classifier fitted above.
# Each blue point represents the performance of a set of rules: the kth point
# represents the score associated with the concatenation (union) of the k first
# rules, etc. Thus, each blue point is associated with an interpretable
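
###############################################################################
# A hedged sketch of how these per-k points could be recomputed. It assumes
# ``clf`` is the SkopeRules model fitted in the elided part of this example,
# and it relies on the ``rules_`` attribute and the ``predict_top_rules``
# method of the skope-rules API; treat both as assumptions to check against
# the library version in use.

from sklearn.metrics import precision_score, recall_score

# For each k, evaluate the union (vote) of the k best rules found by
# SkopeRules on the test set.
for k in range(1, len(clf.rules_) + 1):
    y_pred_k = clf.predict_top_rules(X_test, k)
    print("k=%d precision=%.3f recall=%.3f"
          % (k, precision_score(y_test, y_pred_k),
             recall_score(y_test, y_pred_k)))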