"""Class to perform over-sampling using SMOTE and cleaning using Tomek
links."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Christos Aridas
# License: MIT
from __future__ import division
from sklearn.base import clone
from sklearn.utils import check_X_y
from ..base import BaseSampler
from ..over_sampling import SMOTE
from ..over_sampling.base import BaseOverSampler
from ..under_sampling import TomekLinks
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring
[docs]@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
random_state=_random_state_docstring)
class SMOTETomek(BaseSampler):
"""Class to perform over-sampling using SMOTE and cleaning using
Tomek links.
Combine over- and under-sampling using SMOTE and Tomek links.
Read more in the :ref:`User Guide <combine>`.
Parameters
----------
{sampling_strategy}
{random_state}
smote : object, optional (default=SMOTE())
The :class:`imblearn.over_sampling.SMOTE` object to use. If not given,
a :class:`imblearn.over_sampling.SMOTE` object with default parameters
will be given.
tomek : object, optional (default=Tomek())
The :class:`imblearn.under_sampling.Tomek` object to use. If not given,
a :class:`imblearn.under_sampling.Tomek` object with default parameters
will be given.
ratio : str, dict, or callable
.. deprecated:: 0.4
Use the parameter ``sampling_strategy`` instead. It will be removed
in 0.6.
Notes
-----
The methos is presented in [1]_.
Supports multi-class resampling. Refer to SMOTE and TomekLinks regarding
the scheme which used.
See also
--------
SMOTEENN : Over-sample using SMOTE followed by under-sampling using Edited
Nearest Neighbours.
References
----------
.. [1] G. Batista, B. Bazzan, M. Monard, "Balancing Training Data for
Automated Annotation of Keywords: a Case Study," In WOB, 10-18, 2003.
Examples
--------
>>> from collections import Counter
>>> from sklearn.datasets import make_classification
>>> from imblearn.combine import \
SMOTETomek # doctest: +NORMALIZE_WHITESPACE
>>> X, y = make_classification(n_classes=2, class_sep=2,
... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
>>> print('Original dataset shape %s' % Counter(y))
Original dataset shape Counter({{1: 900, 0: 100}})
>>> smt = SMOTETomek(random_state=42)
>>> X_res, y_res = smt.fit_resample(X, y)
>>> print('Resampled dataset shape %s' % Counter(y_res))
Resampled dataset shape Counter({{0: 900, 1: 900}})
"""
_sampling_type = 'over-sampling'
[docs] def __init__(self,
sampling_strategy='auto',
random_state=None,
smote=None,
tomek=None,
ratio=None):
super(SMOTETomek, self).__init__()
self.sampling_strategy = sampling_strategy
self.random_state = random_state
self.smote = smote
self.tomek = tomek
self.ratio = ratio
def _validate_estimator(self):
"Private function to validate SMOTE and ENN objects"
if self.smote is not None:
if isinstance(self.smote, SMOTE):
self.smote_ = clone(self.smote)
else:
raise ValueError('smote needs to be a SMOTE object.'
'Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
sampling_strategy=self.sampling_strategy,
random_state=self.random_state,
ratio=self.ratio)
if self.tomek is not None:
if isinstance(self.tomek, TomekLinks):
self.tomek_ = clone(self.tomek)
else:
raise ValueError('tomek needs to be a TomekLinks object.'
'Got {} instead.'.format(type(self.tomek)))
# Otherwise create a default TomekLinks
else:
self.tomek_ = TomekLinks(sampling_strategy='all')
def _fit_resample(self, X, y):
self._validate_estimator()
y = check_target_type(y)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
self.sampling_strategy_ = self.sampling_strategy
X_res, y_res = self.smote_.fit_resample(X, y)
return self.tomek_.fit_resample(X_res, y_res)