Source code for imblearn.combine._smote_tomek

"""Class to perform over-sampling using SMOTE and cleaning using Tomek
links."""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
#          Christos Aridas
# License: MIT

from __future__ import division

from sklearn.base import clone
from sklearn.utils import check_X_y

from ..base import BaseSampler
from ..over_sampling import SMOTE
from ..over_sampling.base import BaseOverSampler
from ..under_sampling import TomekLinks
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring


[docs]@Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring) class SMOTETomek(BaseSampler): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. Combine over- and under-sampling using SMOTE and Tomek links. Read more in the :ref:`User Guide <combine>`. Parameters ---------- {sampling_strategy} {random_state} smote : object, optional (default=SMOTE()) The :class:`imblearn.over_sampling.SMOTE` object to use. If not given, a :class:`imblearn.over_sampling.SMOTE` object with default parameters will be given. tomek : object, optional (default=Tomek()) The :class:`imblearn.under_sampling.Tomek` object to use. If not given, a :class:`imblearn.under_sampling.Tomek` object with default parameters will be given. ratio : str, dict, or callable .. deprecated:: 0.4 Use the parameter ``sampling_strategy`` instead. It will be removed in 0.6. Notes ----- The methos is presented in [1]_. Supports multi-class resampling. Refer to SMOTE and TomekLinks regarding the scheme which used. See also -------- SMOTEENN : Over-sample using SMOTE followed by under-sampling using Edited Nearest Neighbours. References ---------- .. [1] G. Batista, B. Bazzan, M. Monard, "Balancing Training Data for Automated Annotation of Keywords: a Case Study," In WOB, 10-18, 2003. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.combine import \ SMOTETomek # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> smt = SMOTETomek(random_state=42) >>> X_res, y_res = smt.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ _sampling_type = 'over-sampling'
[docs] def __init__(self, sampling_strategy='auto', random_state=None, smote=None, tomek=None, ratio=None): super(SMOTETomek, self).__init__() self.sampling_strategy = sampling_strategy self.random_state = random_state self.smote = smote self.tomek = tomek self.ratio = ratio
def _validate_estimator(self): "Private function to validate SMOTE and ENN objects" if self.smote is not None: if isinstance(self.smote, SMOTE): self.smote_ = clone(self.smote) else: raise ValueError('smote needs to be a SMOTE object.' 'Got {} instead.'.format(type(self.smote))) # Otherwise create a default SMOTE else: self.smote_ = SMOTE( sampling_strategy=self.sampling_strategy, random_state=self.random_state, ratio=self.ratio) if self.tomek is not None: if isinstance(self.tomek, TomekLinks): self.tomek_ = clone(self.tomek) else: raise ValueError('tomek needs to be a TomekLinks object.' 'Got {} instead.'.format(type(self.tomek))) # Otherwise create a default TomekLinks else: self.tomek_ = TomekLinks(sampling_strategy='all') def _fit_resample(self, X, y): self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) self.sampling_strategy_ = self.sampling_strategy X_res, y_res = self.smote_.fit_resample(X, y) return self.tomek_.fit_resample(X_res, y_res)