
"""Class to perform under-sampling using balace cascade."""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
#          Christos Aridas
# License: MIT

from collections import Counter

import numpy as np

from sklearn.base import ClassifierMixin, clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import check_random_state, safe_indexing
from sklearn.model_selection import cross_val_predict
from sklearn.utils.deprecation import deprecated

from .base import BaseEnsembleSampler
from ..under_sampling.base import BaseUnderSampler
from ..utils import check_sampling_strategy
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring


@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring)
@deprecated('BalanceCascade is deprecated in 0.4 and will be removed in '
            '0.6.')
class BalanceCascade(BaseEnsembleSampler):
    """Create an ensemble of balanced sets by iteratively under-sampling the
    imbalanced dataset using an estimator.

    This method iteratively selects subsets and makes an ensemble of the
    different sets. The selection is performed using a specific classifier.

    Parameters
    ----------
    {sampling_strategy}

    return_indices : bool, optional (default=False)
        Whether or not to return the indices of the samples randomly
        selected from the majority class.

    {random_state}

    n_max_subset : int or None, optional (default=None)
        Maximum number of subsets to generate. By default, all the data from
        the training set will be selected, which can lead to a large number
        of subsets.

    estimator : object, optional (default=KNeighborsClassifier())
        An estimator inherited from :class:`sklearn.base.ClassifierMixin`
        and having an attribute :func:`predict_proba`.

    ratio : str, dict, or callable
        .. deprecated:: 0.4
           Use the parameter ``sampling_strategy`` instead. It will be
           removed in 0.6.

    Notes
    -----
    The method is described in [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used as
    originally proposed in [1]_.

    See also
    --------
    BalancedBaggingClassifier, EasyEnsemble

    References
    ----------
    .. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for
       Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and
       Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550,
       April 2009.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.ensemble import \
BalanceCascade # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> bc = BalanceCascade(random_state=42)
    >>> X_res, y_res = bc.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res[0])) \
# doctest: +ELLIPSIS
    Resampled dataset shape Counter({{...}})

    """

    def __init__(self,
                 sampling_strategy='auto',
                 return_indices=False,
                 random_state=None,
                 n_max_subset=None,
                 estimator=None,
                 ratio=None):
        super(BalanceCascade, self).__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.return_indices = return_indices
        self.estimator = estimator
        self.n_max_subset = n_max_subset

    def _validate_estimator(self):
        """Private function to create the classifier"""

        if (self.estimator is not None and
                isinstance(self.estimator, ClassifierMixin) and
                hasattr(self.estimator, 'predict')):
            self.estimator_ = clone(self.estimator)
        elif self.estimator is None:
            self.estimator_ = KNeighborsClassifier()
        else:
            raise ValueError('Invalid parameter `estimator`. Got {}.'.format(
                type(self.estimator)))
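
    # The balance-cascade loop in ``_fit_resample`` below proceeds as
    # follows: at each iteration, a balanced subset is drawn from the
    # samples still available, ``estimator_`` is fitted and evaluated on it
    # through 3-fold cross-validation, and the correctly classified samples
    # of the under-sampled classes are removed from the pool of available
    # samples. The search stops once ``n_max_subset`` subsets have been
    # built or when too few samples remain to draw another balanced subset.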
    def _fit_resample(self, X, y):
        self._validate_estimator()

        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, 'under-sampling')

        random_state = check_random_state(self.random_state)

        # array to know which samples are available to be taken
        samples_mask = np.ones(y.shape, dtype=bool)

        # where the different set will be stored
        idx_under = []

        n_subsets = 0
        b_subset_search = True
        while b_subset_search:
            target_stats = Counter(
                safe_indexing(y, np.flatnonzero(samples_mask)))
            # store the index of the data to under-sample
            index_under_sample = np.empty((0, ), dtype=np.int)
            # value which will be picked at each round
            index_constant = np.empty((0, ), dtype=np.int)
            for target_class in target_stats.keys():
                if target_class in self.sampling_strategy_.keys():
                    n_samples = self.sampling_strategy_[target_class]
                    # extract the data of interest for this round from the
                    # current class
                    index_class = np.flatnonzero(y == target_class)
                    index_class_interest = index_class[samples_mask[
                        y == target_class]]
                    y_class = safe_indexing(y, index_class_interest)
                    # select randomly the desired features
                    index_target_class = random_state.choice(
                        range(y_class.size), size=n_samples, replace=False)
                    index_under_sample = np.concatenate(
                        (index_under_sample,
                         index_class_interest[index_target_class]),
                        axis=0)
                else:
                    index_constant = np.concatenate(
                        (index_constant, np.flatnonzero(y == target_class)),
                        axis=0)

            # store the set created
            n_subsets += 1
            subset_indices = np.concatenate(
                (index_under_sample, index_constant), axis=0)
            idx_under.append(subset_indices)

            # fit and predict using cross validation
            X_subset = safe_indexing(X, subset_indices)
            y_subset = safe_indexing(y, subset_indices)
            pred = cross_val_predict(self.estimator_, X_subset, y_subset,
                                     cv=3)
            # extract the prediction about the targeted classes only
            pred_target = pred[:index_under_sample.size]
            index_classified = index_under_sample[
                pred_target == safe_indexing(
                    y_subset, range(index_under_sample.size))]
            samples_mask[index_classified] = False

            # check the stopping criterion
            if self.n_max_subset is not None:
                if n_subsets == self.n_max_subset:
                    b_subset_search = False
            # check that there is enough samples for another round
            target_stats = Counter(
                safe_indexing(y, np.flatnonzero(samples_mask)))
            for target_class in self.sampling_strategy_.keys():
                if (target_stats[target_class] <
                        self.sampling_strategy_[target_class]):
                    b_subset_search = False

        X_resampled, y_resampled = [], []
        for indices in idx_under:
            X_resampled.append(safe_indexing(X, indices))
            y_resampled.append(safe_indexing(y, indices))

        if self.return_indices:
            return (np.array(X_resampled), np.array(y_resampled),
                    np.array(idx_under))
        else:
            return np.array(X_resampled), np.array(y_resampled)
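

# A minimal usage sketch: each balanced subset returned by ``fit_resample``
# can train one member of an ensemble whose predictions are aggregated by
# majority vote, in the spirit of the reference above. The choice of
# ``DecisionTreeClassifier`` here is an arbitrary example.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=1000, random_state=10)
    bc = BalanceCascade(random_state=42)
    X_res, y_res = bc.fit_resample(X, y)

    # one classifier per balanced subset
    members = [DecisionTreeClassifier(random_state=0).fit(X_sub, y_sub)
               for X_sub, y_sub in zip(X_res, y_res)]

    # majority vote over the members' predictions
    votes = np.array([clf.predict(X) for clf in members])
    y_pred = np.apply_along_axis(
        lambda sample_votes: np.bincount(sample_votes).argmax(),
        axis=0, arr=votes)
    print('Training accuracy of the vote: {:.3f}'.format(
        np.mean(y_pred == y)))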