"""Class to perform under-sampling using easy ensemble."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Christos Aridas
# License: MIT
import numbers
import numpy as np
from sklearn.base import clone
from sklearn.utils import check_random_state
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.utils.deprecation import deprecated
from .base import BaseEnsembleSampler
from ..under_sampling import RandomUnderSampler
from ..under_sampling.base import BaseUnderSampler
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring
from ..pipeline import Pipeline
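# Exclusive upper bound used when drawing integer seeds for the per-subset
# random under-samplers.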
MAX_INT = np.iinfo(np.int32).max


@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring)
@deprecated('EasyEnsemble is deprecated in 0.4 and will be removed in 0.6. '
            'Use EasyEnsembleClassifier instead.')
class EasyEnsemble(BaseEnsembleSampler):
"""Create an ensemble sets by iteratively applying random under-sampling.
This method iteratively select a random subset and make an ensemble of the
different sets.
.. deprecated:: 0.4
``EasyEnsemble`` is deprecated in 0.4 and will be removed in 0.6. Use
``EasyEnsembleClassifier`` instead.
Parameters
----------
{sampling_strategy}
return_indices : bool, optional (default=False)
Whether or not to return the indices of the samples randomly
selected from the majority class.
{random_state}
replacement : bool, optional (default=False)
Whether or not to sample randomly with replacement or not.
n_subsets : int, optional (default=10)
Number of subsets to generate.
ratio : str, dict, or callable
.. deprecated:: 0.4
Use the parameter ``sampling_strategy`` instead. It will be removed
in 0.6.
Notes
-----
The method is described in [1]_.
Supports multi-class resampling by sampling each class independently.
See also
--------
BalanceCascade, BalancedBaggingClassifier
References
----------
.. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for
Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and
Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550,
April 2009.
Examples
--------
>>> from collections import Counter
>>> from sklearn.datasets import make_classification
>>> from imblearn.ensemble import \
EasyEnsemble # doctest: +NORMALIZE_WHITESPACE
>>> X, y = make_classification(n_classes=2, class_sep=2,
... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
>>> print('Original dataset shape %s' % Counter(y))
Original dataset shape Counter({{1: 900, 0: 100}})
>>> ee = EasyEnsemble(random_state=42) # doctest: +SKIP
>>> X_res, y_res = ee.fit_resample(X, y) # doctest: +SKIP
>>> print('Resampled dataset shape %s' % Counter(y_res[0])) # doctest: +SKIP
Resampled dataset shape Counter({{0: 100, 1: 100}})
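
    Each entry of the returned arrays holds one balanced subset. As a
    sketch (not part of this class's API; ``DecisionTreeClassifier`` is an
    arbitrary illustrative choice), one learner can be fitted per subset:

    >>> from sklearn.tree import DecisionTreeClassifier  # doctest: +SKIP
    >>> learners = [DecisionTreeClassifier().fit(X_sub, y_sub)
    ...             for X_sub, y_sub in zip(X_res, y_res)]  # doctest: +SKIP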
"""

    def __init__(self,
sampling_strategy='auto',
return_indices=False,
random_state=None,
replacement=False,
n_subsets=10,
ratio=None):
super(EasyEnsemble, self).__init__(
sampling_strategy=sampling_strategy, ratio=ratio)
self.random_state = random_state
self.return_indices = return_indices
self.replacement = replacement
self.n_subsets = n_subsets

    def _fit_resample(self, X, y):
random_state = check_random_state(self.random_state)
X_resampled = []
y_resampled = []
if self.return_indices:
idx_under = []
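        # Draw an independent seed for each subset so that every
        # RandomUnderSampler instance produces a different random subset.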
for _ in range(self.n_subsets):
rus = RandomUnderSampler(
sampling_strategy=self.sampling_strategy_,
random_state=random_state.randint(MAX_INT),
replacement=self.replacement)
sel_x, sel_y = rus.fit_resample(X, y)
X_resampled.append(sel_x)
y_resampled.append(sel_y)
if self.return_indices:
idx_under.append(rus.sample_indices_)
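        # Stack the collected subsets so that the first axis of each returned
        # array indexes the generated subsets.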
if self.return_indices:
return (np.array(X_resampled), np.array(y_resampled),
np.array(idx_under))
else:
return np.array(X_resampled), np.array(y_resampled)


@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring)
class EasyEnsembleClassifier(BaggingClassifier):
"""Bag of balanced boosted learners also known as EasyEnsemble.
This algorithm is known as EasyEnsemble [1]_. The classifier is an
ensemble of AdaBoost learners trained on different balanced boostrap
samples. The balancing is achieved by random under-sampling.
Read more in the :ref:`User Guide <boosting>`.
Parameters
----------
n_estimators : int, optional (default=10)
Number of AdaBoost learners in the ensemble.
base_estimator : object, optional (default=AdaBoostClassifier())
The base AdaBoost classifier used in the inner ensemble. Note that you
can set the number of inner learner by passing your own instance.
warm_start : bool, optional (default=False)
When set to True, reuse the solution of the previous call to fit
and add more estimators to the ensemble, otherwise, just fit
a whole new ensemble.
{sampling_strategy}
replacement : bool, optional (default=False)
Whether or not to sample randomly with replacement or not.
n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.
{random_state}
verbose : int, optional (default=0)
Controls the verbosity of the building process.
Attributes
----------
base_estimator_ : estimator
The base estimator from which the ensemble is grown.
estimators_ : list of estimators
The collection of fitted base estimators.
classes_ : array, shape (n_classes,)
The classes labels.
n_classes_ : int or list
The number of classes.
Notes
-----
The method is described in [1]_.
Supports multi-class resampling by sampling each class independently.
See also
--------
BalancedBaggingClassifier, BalancedRandomForestClassifier
References
----------
.. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for
Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and
Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550,
April 2009.
Examples
--------
>>> from collections import Counter
>>> from sklearn.datasets import make_classification
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import confusion_matrix
>>> from imblearn.ensemble import \
EasyEnsembleClassifier # doctest: +NORMALIZE_WHITESPACE
>>> X, y = make_classification(n_classes=2, class_sep=2,
... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
>>> print('Original dataset shape %s' % Counter(y))
Original dataset shape Counter({{1: 900, 0: 100}})
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,
... random_state=0)
>>> eec = EasyEnsembleClassifier(random_state=42)
>>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS
EasyEnsembleClassifier(...)
>>> y_pred = eec.predict(X_test)
>>> print(confusion_matrix(y_test, y_pred))
[[ 23 0]
[ 2 225]]
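
    To control the size of the inner boosted ensembles, a configured
    instance can be passed as ``base_estimator`` (a sketch; the chosen
    ``n_estimators`` values are arbitrary):

    >>> from sklearn.ensemble import AdaBoostClassifier  # doctest: +SKIP
    >>> eec = EasyEnsembleClassifier(
    ...     n_estimators=10,
    ...     base_estimator=AdaBoostClassifier(n_estimators=50),
    ...     random_state=42)  # doctest: +SKIP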
"""

    def __init__(self, n_estimators=10, base_estimator=None,
                 warm_start=False, sampling_strategy='auto',
                 replacement=False, n_jobs=1, random_state=None, verbose=0):
        super(EasyEnsembleClassifier, self).__init__(
            base_estimator,
            n_estimators=n_estimators,
            max_samples=1.0,
            max_features=1.0,
            bootstrap=False,
            bootstrap_features=False,
            oob_score=False,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)
        self.sampling_strategy = sampling_strategy
        self.replacement = replacement

    def _validate_estimator(self, default=AdaBoostClassifier()):
        """Check the estimator and the ``n_estimators`` attribute, and set
        the ``base_estimator_`` attribute."""
if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
raise ValueError("n_estimators must be an integer, "
"got {0}.".format(type(self.n_estimators)))
if self.n_estimators <= 0:
raise ValueError("n_estimators must be greater than zero, "
"got {0}.".format(self.n_estimators))
if self.base_estimator is not None:
base_estimator = clone(self.base_estimator)
else:
base_estimator = clone(default)
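        # Wrap the estimator in a pipeline so that each bootstrap sample
        # drawn by BaggingClassifier is balanced by random under-sampling
        # before the boosted learner is fitted on it.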
self.base_estimator_ = Pipeline(
[('sampler', RandomUnderSampler(
sampling_strategy=self.sampling_strategy,
replacement=self.replacement)),
('classifier', base_estimator)])

    def fit(self, X, y):
        """Build a bagging ensemble of AdaBoost classifiers using balanced
        bootstraps obtained by random under-sampling.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.

        y : array-like, shape (n_samples,)
            The target values.

        Returns
        -------
        self : object
            Returns self.

        """
        # RandomUnderSampler does not support sample_weight, so pass None.
        return self._fit(X, y, self.max_samples, sample_weight=None)