Source code for imblearn.combine._smote_enn

"""Class to perform over-sampling using SMOTE and cleaning using ENN."""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
#          Christos Aridas
# License: MIT

from __future__ import division

from sklearn.base import clone
from sklearn.utils import check_X_y

from ..base import BaseSampler
from ..over_sampling import SMOTE
from ..over_sampling.base import BaseOverSampler
from ..under_sampling import EditedNearestNeighbours
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring


@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring)
class SMOTEENN(BaseSampler):
    """Class to perform over-sampling using SMOTE and cleaning using ENN.

    Combine over- and under-sampling using SMOTE and Edited Nearest
    Neighbours.

    Read more in the :ref:`User Guide <combine>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    smote : object, optional (default=SMOTE())
        The :class:`imblearn.over_sampling.SMOTE` object to use. If not given,
        a :class:`imblearn.over_sampling.SMOTE` object with default parameters
        will be given.

    enn : object, optional (default=EditedNearestNeighbours())
        The :class:`imblearn.under_sampling.EditedNearestNeighbours` object to
        use. If not given, an
        :class:`imblearn.under_sampling.EditedNearestNeighbours` object with
        ``sampling_strategy='all'`` will be given.

    ratio : str, dict, or callable
        .. deprecated:: 0.4
           Use the parameter ``sampling_strategy`` instead. It will be removed
           in 0.6.

    Notes
    -----
    The method is presented in [1]_.

    Supports multi-class resampling. Refer to SMOTE and ENN regarding the
    scheme which is used.

    See also
    --------
    SMOTETomek : Over-sample using SMOTE followed by under-sampling removing
        the Tomek's links.

    References
    ----------
    .. [1] G. Batista, R. C. Prati, M. C. Monard. "A study of the behavior of
       several methods for balancing machine learning training data," ACM
       SIGKDD Explorations Newsletter 6 (1), 20-29, 2004.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.combine import SMOTEENN # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> sme = SMOTEENN(random_state=42)
    >>> X_res, y_res = sme.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 900, 1: 881}})

    """

    _sampling_type = 'over-sampling'

    def __init__(self,
                 sampling_strategy='auto',
                 random_state=None,
                 smote=None,
                 enn=None,
                 ratio=None):
        super(SMOTEENN, self).__init__()
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.smote = smote
        self.enn = enn
        self.ratio = ratio

    def _validate_estimator(self):
        """Private function to validate SMOTE and ENN objects."""
        if self.smote is not None:
            if isinstance(self.smote, SMOTE):
                self.smote_ = clone(self.smote)
            else:
                raise ValueError('smote needs to be a SMOTE object.'
                                 ' Got {} instead.'.format(type(self.smote)))
        # Otherwise create a default SMOTE
        else:
            self.smote_ = SMOTE(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state,
                ratio=self.ratio)

        if self.enn is not None:
            if isinstance(self.enn, EditedNearestNeighbours):
                self.enn_ = clone(self.enn)
            else:
                raise ValueError('enn needs to be an EditedNearestNeighbours.'
                                 ' Got {} instead.'.format(type(self.enn)))
        # Otherwise create a default EditedNearestNeighbours
        else:
            self.enn_ = EditedNearestNeighbours(sampling_strategy='all')

    def _fit_resample(self, X, y):
        self._validate_estimator()
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        self.sampling_strategy_ = self.sampling_strategy

        # Over-sample with SMOTE first, then clean the result with ENN.
        X_res, y_res = self.smote_.fit_resample(X, y)
        return self.enn_.fit_resample(X_res, y_res)
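

# A minimal usage sketch appended for illustration; it is not part of the
# original module. It exercises the custom-estimator branch of
# ``_validate_estimator`` by passing explicit SMOTE and
# EditedNearestNeighbours objects, which are cloned rather than rebuilt with
# defaults. The ``k_neighbors`` and ``n_neighbors`` values are arbitrary demo
# choices, not values required by SMOTEENN. The guard below assumes the
# package imports above resolve, e.g. when run as
# ``python -m imblearn.combine._smote_enn``.
if __name__ == '__main__':
    from collections import Counter

    from sklearn.datasets import make_classification

    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=1000, random_state=10)

    # SMOTE drives the over-sampling target; ENN then cleans noisy samples
    # from every class (sampling_strategy='all'), so the final counts can end
    # up slightly below a perfect balance.
    sme = SMOTEENN(
        smote=SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42),
        enn=EditedNearestNeighbours(sampling_strategy='all', n_neighbors=3),
        random_state=42)
    X_res, y_res = sme.fit_resample(X, y)
    print('Resampled dataset shape %s' % Counter(y_res))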