Merge pull request #210 from dpr1005/development

dpuenteramirez · web-flow · commit 76f952602999 · 2022-05-09T11:03:46.000+02:00
Improved code quality
diff --git a/.deepsource.toml b/.deepsource.toml
@@ -1,8 +1,8 @@
 version = 1
 
 test_patterns = [
-  "test/**",
-  "test_*"
+  "\"tests/**\",",
+  "\"test_*.py\","
 ]
 
 exclude_patterns = [
diff --git a/instance_selection/_CNN.py b/instance_selection/_CNN.py
@@ -79,7 +79,7 @@ def filter(self, samples, y):
                     indexes.append(index)
                     store_not_modified = True
             delete_multiple_element(handbag, indexes)
-        del handbag
+
         samples = pd.DataFrame(store, columns=self.x_attr)
         y = pd.DataFrame(
             np.array(store_classes, dtype=object).flatten().astype(int))
diff --git a/instance_selection/_ENN.py b/instance_selection/_ENN.py
@@ -47,11 +47,11 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
 
     def _neighs(self, s_samples, s_targets, index, removed):
         """
-        _neighs() takes in the samples and targets, the index of the sample to
-        be removed, and the number of samples already removed. It returns the
-        sample to be removed, its target, the targets of the samples not yet
-        removed, the samples not yet removed, and the indices of the nearest
-        neighbors of the sample to be removed.
+        The function takes in the samples and targets, the index of the
+        sample to be removed, and the number of samples already removed. It
+        returns the sample to be removed, its target, the targets of the
+        samples not yet removed, the samples not yet removed, and the
+        indices of the nearest neighbors of the sample to be removed.
 
         :param s_samples: the samples that are being used to train the model
         :param s_targets: the targets of the samples
diff --git a/instance_selection/_LocalSets.py b/instance_selection/_LocalSets.py
@@ -88,6 +88,19 @@ def _find_enemy_distance(
         label,
         labels,
     ):
+        """
+        It finds the closest enemy sample to the current sample
+
+        :param closest_enemy_distance: the distance to the closest enemy sample
+        :param closest_enemy_sample: the index of the closest enemy sample
+        :param distances: the distance matrix
+        :param index: the index of the current sample
+        :param instances: the data
+        :param label: the label of the current sample
+        :param labels: the labels of the samples
+        :return: The closest enemy distance and the index of the closest enemy
+        sample.
+        """
         for index2, (_, label2) in enumerate(zip(instances, labels)):
             if index == index2 or label == label2:
                 continue
@@ -146,10 +159,6 @@ class LSSm(LocalSets):
 
     """
 
-    def __init__(self):
-        """A constructor for the class."""
-        super().__init__()
-
     def filter(self, instances, labels):
         """
         The function takes in a dataframe of instances and a dataframe of
@@ -200,10 +209,6 @@ class LSBo(LocalSets):
 
     """
 
-    def __init__(self):
-        """A constructor for the class."""
-        super(LSBo, self).__init__()
-
     def filter(self, instances, labels):
         """
         > The function takes in a dataframe of instances and a dataframe of
diff --git a/instance_selection/_MSS.py b/instance_selection/_MSS.py
@@ -83,17 +83,15 @@ def _enemy_distance(dat, tar):
         :return: A list of lists, where each list contains a sample, its class,
          and its distance to its nearest enemy.
         """
-
         solution = []
         for sample, x_class in zip(dat, tar):
             distance = sys.maxsize
             for sample_1, x1_class in zip(dat, tar):
                 if x1_class == x_class:
                     continue
-                else:
-                    euc = np.linalg.norm(sample - sample_1)
-                    if euc < distance:
-                        distance = euc
+                euc = np.linalg.norm(sample - sample_1)
+                if euc < distance:
+                    distance = euc
             solution.append([sample, x_class, distance])
 
         solution.sort(key=lambda x: x[2])
diff --git a/semisupervised/DemocraticCoLearning.py b/semisupervised/DemocraticCoLearning.py
@@ -395,7 +395,7 @@ def predict(self, samples):
         confidence = [0 for _ in range(self.n_labels)]
         for index, j in enumerate(gj):
             izq = (j + 0.5) / (j + 1)
-            div = True if j != 0 else False
+            div = j != 0
             if div:
                 der = [
                     (gj_h[0][index] * self.w1) / gj[index],
diff --git a/semisupervised/DensityPeaks.py b/semisupervised/DensityPeaks.py
@@ -70,6 +70,22 @@ def __init__(
         else:
             self.filter = None
 
+        self.y = None
+        self.low = None
+        self.u = None
+        self.classifier_stdpnf = None
+        self.order = None
+        self.structure = None
+        self.structure_stdnpf = None
+        self.n_id = None
+        self.distances = None
+        self.max_dis = None
+        self.min_dis = None
+        self.rho = None
+        self.delta = None
+        self.nneigh = None
+        self.data = None
+
     def __build_distance(self):
         """
         Calculate distance dict.
@@ -182,9 +198,13 @@ def __min_neighbor_and_distance(self):
 
         :return: distance vector, nearest neighbor vector
         """
+        if self.rho is None:
+            raise ValueError("Encountered rho as None.")
+
         sort_rho_idx = np.argsort(-self.rho)
         delta, nneigh = [float(self.max_dis)] * self.n_id, [0] * self.n_id
         delta[sort_rho_idx[0]] = -1.0
+
         for i in range(self.n_id):
             for j in range(0, i):
                 old_i, old_j = sort_rho_idx[i], sort_rho_idx[j]
@@ -240,6 +260,7 @@ def __step_a(self):
         return samples_labeled
 
     def __discover_structure(self):
+        """Discovers the underlying structure."""
         self._fit_without()
 
     def __nan_search(self):
@@ -343,22 +364,22 @@ def __enane(self, fx, nan, r):
 
         return es, es_pred
 
-    def __init_values(self, l, u, y):
+    def __init_values(self, low, u, y):
         """
         It takes in the lower and upper bounds of the data, and the data itself,
          and then calculates the distances between the data points,
          the maximum distance, the minimum distance, the dc value, the rho
          value, the delta value, the number of neighbors, and the structure
          of the data
 
-        :param l: lower bound of the data
+        :param low: lower bound of the data
         :param u: upper bound of the data
         :param y: the labels of the data
         """
         self.y = y
-        self.l = l
+        self.low = low
         self.u = u
-        self.data = np.concatenate((l, u), axis=0)
+        self.data = np.concatenate((low, u), axis=0)
         self.n_id = self.data.shape[0]
         self.distances, self.max_dis, self.min_dis = self.__build_distance()
         self.dc = self.__select_dc()
@@ -447,14 +468,13 @@ def _fit_stdpnf(self):
         Self Training based on Density Peaks and a parameter-free noise
         filter.
         """
-
         self.__discover_structure()
 
         nan, lambda_param = self.__nan_search()
         self.classifier_stdpnf = KNeighborsClassifier(
             n_neighbors=self.k, metric=self.distance_metric
         )
-        self.classifier_stdpnf.fit(self.l, self.y)
+        self.classifier_stdpnf.fit(self.low, self.y)
         count = 1
 
         while count <= max(self.order.values()):
@@ -530,7 +550,7 @@ def _if_filter(self, complete, complete_y):
         :return: The result is a dataframe with the filtered data.
         """
         if isinstance(self.filter, ENN):
-            original = pd.DataFrame(self.l)
+            original = pd.DataFrame(self.low)
             original_y = pd.DataFrame(self.y)
             result, _ = self.filter.filter_original_complete(
                 original, original_y, complete, complete_y
diff --git a/semisupervised/TriTraining.py b/semisupervised/TriTraining.py
@@ -18,13 +18,6 @@
 from .utils import split
 
 
-def measure_error(classifier_j, classifier_k, labeled_data):
-    pred_j = classifier_j.predict(labeled_data)
-    pred_k = classifier_k.predict(labeled_data)
-    same = len([0 for x, y in zip(pred_j, pred_k) if x == y])
-    return (len(pred_j) - same) / same
-
-
 class TriTraining:
     """
     Zhou, Z. H., & Li, M. (2005). Tri-training: Exploiting unlabeled data
@@ -244,7 +237,7 @@ def _train_classifier(self, ep_k, h_i, h_j, h_k, labeled, lp_k, u):
         """
         update_k = False
         l_k = Bunch(data=np.array([]), target=np.array([]))
-        e_k = measure_error(h_j, h_k, labeled)
+        e_k = self.measure_error(h_j, h_k, labeled)
         if e_k < ep_k:
             for sample in u:
                 sample_s = sample.reshape(1, -1)
@@ -286,3 +279,20 @@ def predict(self, samples):
             labels.append(np.where(count == np.amax(count))[0][0])
 
         return np.array(labels)
+
+    @staticmethod
+    def measure_error(classifier_j, classifier_k, labeled_data):
+        """
+        It returns the ratio of disagreements to agreements between
+        classifiers j and k on the labeled data
+
+        :param classifier_j: the classifier you want to compare to
+        :param classifier_k: the classifier that we want to measure the error of
+        :param labeled_data: the labeled data that we're using to train the
+        classifiers
+        :return: The error rate of the two classifiers.
+        """
+        pred_j = classifier_j.predict(labeled_data)
+        pred_k = classifier_k.predict(labeled_data)
+        same = len([0 for x, y in zip(pred_j, pred_k) if x == y])
+        return (len(pred_j) - same) / same
diff --git a/semisupervised/ensemble/_RESSEL.py b/semisupervised/ensemble/_RESSEL.py
@@ -80,7 +80,6 @@ def fit(self, labeled, unlabeled, base_estimator, estimator_params=None):
         :param estimator_params: dict of params to pass to the estimator.
         :return: the ensemble in case is needed.
         """
-
         self._validate_params(base_estimator, labeled, unlabeled)
 
         self._init_ensemble(base_estimator, estimator_params)
@@ -218,7 +217,6 @@ def _robust_self_training(self, iteration, l_i, u_i, oob_i, d_class_i):
         :param d_class_i: the proportion of samples to be selected from each
         class
         """
-
         y_pred = self.ensemble[iteration].predict(oob_i.iloc[:, :-1])
         best_error_i = f1_score(
             y_true=np.ravel(oob_i.iloc[:, -1:]),
diff --git a/tests/test_InstanceSelection.py b/tests/test_InstanceSelection.py
@@ -23,6 +23,7 @@ def to_dataframe(y):
     """
     if not isinstance(y, pd.DataFrame):
         return pd.DataFrame(y)
+    return y
 
 
 @pytest.fixture
diff --git a/utils/arff2dataset.py b/utils/arff2dataset.py
@@ -20,6 +20,8 @@ def arff_data(dataset_path, attr=False):
     defaults to False (optional)
     :return: A bunch object with the data, target and attributes.
     """
+    if ".arff" not in str(dataset_path).lower():
+        raise ValueError("File does not an ARFF extension.")
     file = open(dataset_path, "r")
     attrs, data = _read_file(file)
     file.close()
@@ -33,8 +35,7 @@ def arff_data(dataset_path, attr=False):
 
     if not attr:
         return Bunch(data=data, target=labels)
-    else:
-        return Bunch(data=data, target=labels, attr=attrs)
+    return Bunch(data=data, target=labels, attr=attrs)
 
 
 def _read_file(file):
diff --git a/utils/dir.py b/utils/dir.py
@@ -8,6 +8,11 @@
 
 
 def check_dir(path):
+    """
+    If the path doesn't exist, create it
+
+    :param path: the path to the folder where the ranks solutions will be saved
+    """
     if not os.path.isdir(path):
         os.mkdir(path)
         if os.path.isdir(path):

Original file line number	Diff line number	Diff line change
`@@ -1,8 +1,8 @@`
`1`	`1`	`version = 1`
`2`	`2`
`3`	`3`	`test_patterns = [`
`4`		`- "test/**",`
`5`		`- "test_*"`
	`4`	`+ "\"tests/**\",",`
	`5`	`+ "\"test_*.py\","`
`6`	`6`	`]`
`7`	`7`
`8`	`8`	`exclude_patterns = [`