@@ -251,3 +251,219 @@ def transform(self, X, y):
251
251
return X_resampled , y_resampled , idx_under
252
252
else :
253
253
return X_resampled , y_resampled
254
+
255
+
256
class RepeatedEditedNearestNeighbours(UnderSampler):
    """Class to perform under-sampling based on the repeated edited nearest
    neighbour method.

    The wrapped ``EditedNearestNeighbours`` sampler is applied over and over
    to its own output until either the number of samples stops changing or
    ``max_iter`` passes have been made.

    Parameters
    ----------
    return_indices : bool, optional (default=False)
        Either to return or not the indices which will be selected from
        the majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Boolean to either or not print information about the processing.

    size_ngh : int, optional (default=3)
        Size of the neighbourhood, forwarded as-is to the underlying
        ``EditedNearestNeighbours`` sampler.

    max_iter : int, optional (default=100)
        Maximum number of iterations of the edited nearest neighbours
        algorithm for a single run. Must be at least 2.

    kind_sel : str, optional (default='all')
        Strategy to use in order to exclude samples.

        - If 'all', all neighbours will have to agree with the samples of
          interest to not be excluded.
        - If 'mode', the majority vote of the neighbours will be used in
          order to exclude a sample.

    n_jobs : int, optional (default=-1)
        The number of thread to open when it is possible.

    Attributes
    ----------
    ratio_ : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balanced
        the dataset. Otherwise, the ratio will corresponds to the number
        of samples in the minority class over the number of samples
        in the majority class.

    rs_ : int or None, optional (default=None)
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurences of each class is
        reported.

    enn_ : EditedNearestNeighbours
        The single-pass sampler which is applied repeatedly by
        ``transform``. It is always created with ``verbose=False``.

    Notes
    -----
    The method is based on [1]_.

    This class supports multi-class.

    References
    ----------
    .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor
       Rule," IEEE Trans. Systems, Man, and Cybernetics, vol. 6, no. 6,
       pp. 448-452, June 1976.

    """

    def __init__(self, return_indices=False, random_state=None, verbose=True,
                 size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1):
        """Initialisation of RENN object.

        Parameters
        ----------
        return_indices : bool, optional (default=False)
            Either to return or not the indices which will be selected from
            the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Boolean to either or not print information about the processing.

        size_ngh : int, optional (default=3)
            Size of the neighbourhood, forwarded as-is to the underlying
            ``EditedNearestNeighbours`` sampler.

        max_iter : int, optional (default=100)
            Maximum number of iterations of the edited nearest neighbours
            algorithm for a single run. Must be at least 2.

        kind_sel : str, optional (default='all')
            Strategy to use in order to exclude samples.

            - If 'all', all neighbours will have to agree with the samples of
              interest to not be excluded.
            - If 'mode', the majority vote of the neighbours will be used in
              order to exclude a sample.

        n_jobs : int, optional (default=-1)
            The number of thread to open when it is possible.

        Returns
        -------
        None

        Raises
        ------
        NotImplementedError
            If `kind_sel` is neither 'all' nor 'mode'.

        ValueError
            If `max_iter` is lower than 2.

        """
        super(RepeatedEditedNearestNeighbours, self).__init__(
            return_indices=return_indices,
            random_state=random_state,
            verbose=verbose)

        self.size_ngh = size_ngh

        # The exception type is kept for backward compatibility; only an
        # explanatory message has been added.
        possible_kind_sel = ('all', 'mode')
        if kind_sel not in possible_kind_sel:
            raise NotImplementedError(
                'kind_sel must be one of {}; got {!r}.'.format(
                    possible_kind_sel, kind_sel))
        self.kind_sel = kind_sel
        self.n_jobs = n_jobs

        if max_iter < 2:
            raise ValueError('max_iter must be greater than 1.')
        self.max_iter = max_iter

        # Single-pass sampler applied repeatedly in `transform`. It is kept
        # silent so that only this object reports on the processing.
        self.enn_ = EditedNearestNeighbours(
            return_indices=return_indices,
            random_state=random_state, verbose=False,
            size_ngh=size_ngh, kind_sel=kind_sel,
            n_jobs=n_jobs)

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Both this object and the wrapped ``enn_`` sampler are fitted on the
        same data.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        super(RepeatedEditedNearestNeighbours, self).fit(X, y)
        self.enn_.fit(X, y)

        return self

    def transform(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        idx_under : ndarray of int
            Only returned if `return_indices` is `True`. Integer indices,
            relative to the `X` given to this method, of the samples which
            have been selected.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)
        X_, y_ = X.copy(), y.copy()

        if self.return_indices:
            # Start from the identity mapping; each pass composes the
            # selection of that pass on top of it so that the final indices
            # always refer to the original X.
            idx_under = np.arange(X.shape[0], dtype=int)

        for _ in range(self.max_iter):
            prev_len = y_.shape[0]
            if self.return_indices:
                X_, y_, idx_ = self.enn_.transform(X_, y_)
                idx_under = idx_under[idx_]
            else:
                X_, y_ = self.enn_.transform(X_, y_)

            # Stop as soon as a pass does not remove any sample.
            if prev_len == y_.shape[0]:
                break

        if self.verbose:
            print("Under-sampling performed: {}".format(Counter(y_)))

        X_resampled, y_resampled = X_, y_

        # Check if the indices of the samples selected should be returned too
        if self.return_indices:
            # Return the indices of interest
            return X_resampled, y_resampled, idx_under
        else:
            return X_resampled, y_resampled
0 commit comments