# Source code for imblearn.ensemble._weight_boosting

import numbers
from copy import deepcopy

import numpy as np

from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble.base import _set_random_states
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import safe_indexing

from ..under_sampling.base import BaseUnderSampler
from ..under_sampling import RandomUnderSampler
from ..pipeline import make_pipeline
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring


@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring)
class RUSBoostClassifier(AdaBoostClassifier):
    """Random under-sampling integrated into the learning of an AdaBoost
    classifier.

    During learning, the problem of class balancing is alleviated by randomly
    under-sampling the sample at each iteration of the boosting algorithm.

    Read more in the :ref:`User Guide <boosting>`.

    Parameters
    ----------
    base_estimator : object, optional (default=DecisionTreeClassifier)
        The base estimator from which the boosted ensemble is built.
        Support for sample weighting is required, as well as proper
        ``classes_`` and ``n_classes_`` attributes.

    n_estimators : integer, optional (default=50)
        The maximum number of estimators at which boosting is terminated.
        In case of perfect fit, the learning procedure is stopped early.

    learning_rate : float, optional (default=1.)
        Learning rate shrinks the contribution of each classifier by
        ``learning_rate``. There is a trade-off between ``learning_rate``
        and ``n_estimators``.

    algorithm : {{'SAMME', 'SAMME.R'}}, optional (default='SAMME.R')
        If 'SAMME.R' then use the SAMME.R real boosting algorithm.
        ``base_estimator`` must support calculation of class probabilities.
        If 'SAMME' then use the SAMME discrete boosting algorithm.
        The SAMME.R algorithm typically converges faster than SAMME,
        achieving a lower test error with fewer boosting iterations.

    {sampling_strategy}

    replacement : bool, optional (default=False)
        Whether to sample randomly with replacement or not.

    {random_state}

    Attributes
    ----------
    estimators_ : list of classifiers
        The collection of fitted sub-estimators.

    samplers_ : list of RandomUnderSampler
        The collection of fitted samplers.

    pipelines_ : list of Pipeline
        The collection of fitted pipelines (samplers + trees).

    classes_ : ndarray, shape (n_classes,)
        The class labels.

    n_classes_ : int
        The number of classes.

    estimator_weights_ : ndarray, shape (n_estimators,)
        Weights for each estimator in the boosted ensemble.

    estimator_errors_ : ndarray, shape (n_estimators,)
        Classification error for each estimator in the boosted ensemble.

    feature_importances_ : ndarray, shape (n_features,)
        The feature importances if supported by the ``base_estimator``.

    See also
    --------
    BalancedBaggingClassifier, BalancedRandomForestClassifier,
    EasyEnsembleClassifier

    References
    ----------
    .. [1] Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano,
       A. "RUSBoost: A hybrid approach to alleviating class imbalance."
       IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems
       and Humans 40.1 (2010): 185-197.

    Examples
    --------
    >>> from imblearn.ensemble import RUSBoostClassifier
    >>> from sklearn.datasets import make_classification
    >>>
    >>> X, y = make_classification(n_samples=1000, n_classes=3,
    ...                            n_informative=4, weights=[0.2, 0.3, 0.5],
    ...                            random_state=0)
    >>> clf = RUSBoostClassifier(random_state=0)
    >>> clf.fit(X, y)  # doctest: +ELLIPSIS
    RUSBoostClassifier(...)
    >>> clf.predict(X)  # doctest: +ELLIPSIS
    array([...])
    """

    def __init__(self,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 sampling_strategy='auto',
                 replacement=False,
                 random_state=None):
        super(RUSBoostClassifier, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state)
        self.sampling_strategy = sampling_strategy
        self.replacement = replacement
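
    # A minimal construction sketch (illustrative only; the parameter values
    # below are arbitrary choices, not library defaults). As the class
    # docstring notes, 'SAMME.R' requires ``predict_proba`` on the base
    # estimator, so a variant using the discrete 'SAMME' algorithm with a
    # shallow tree could look like:
    #
    #     >>> from sklearn.tree import DecisionTreeClassifier
    #     >>> from imblearn.ensemble import RUSBoostClassifier
    #     >>> clf = RUSBoostClassifier(
    #     ...     base_estimator=DecisionTreeClassifier(max_depth=2),
    #     ...     n_estimators=100, algorithm='SAMME', random_state=0)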

    def fit(self, X, y, sample_weight=None):
        """Build a boosted classifier from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. DOK and LIL are converted to CSR.

        y : array-like, shape (n_samples,)
            The target values (class labels).

        sample_weight : array-like, shape (n_samples,), optional
            Sample weights. If None, the sample weights are initialized to
            ``1 / n_samples``.

        Returns
        -------
        self : object
            Returns self.

        """
        self.samplers_ = []
        self.pipelines_ = []
        super(RUSBoostClassifier, self).fit(X, y, sample_weight)
        return self
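
    # After ``fit``, every boosting round leaves behind a fitted sampler and
    # a sampler + estimator pipeline. A minimal inspection sketch, assuming
    # ``clf`` was fitted as in the class docstring example:
    #
    #     >>> len(clf.samplers_) == len(clf.estimators_)
    #     True
    #     >>> clf.samplers_[0].sample_indices_  # rows kept at iteration 0
    #     array([...])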

    def _validate_estimator(self, default=DecisionTreeClassifier()):
        """Check the estimator and the ``n_estimators`` attribute and set
        the ``base_estimator_`` attribute."""
        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
            raise ValueError("n_estimators must be an integer, "
                             "got {0}.".format(type(self.n_estimators)))

        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero, "
                             "got {0}.".format(self.n_estimators))

        if self.base_estimator is not None:
            self.base_estimator_ = clone(self.base_estimator)
        else:
            self.base_estimator_ = clone(default)

        self.base_sampler_ = RandomUnderSampler(
            sampling_strategy=self.sampling_strategy,
            replacement=self.replacement)

    def _make_sampler_estimator(self, append=True, random_state=None):
        """Make and configure a copy of the ``base_estimator_`` attribute.

        Warning: This method should be used to properly instantiate new
        sub-estimators.
        """
        estimator = clone(self.base_estimator_)
        estimator.set_params(**dict((p, getattr(self, p))
                                    for p in self.estimator_params))
        sampler = clone(self.base_sampler_)

        if random_state is not None:
            _set_random_states(estimator, random_state)
            _set_random_states(sampler, random_state)

        if append:
            self.estimators_.append(estimator)
            self.samplers_.append(sampler)
            self.pipelines_.append(make_pipeline(deepcopy(sampler),
                                                 deepcopy(estimator)))

        return estimator, sampler

    def _boost_real(self, iboost, X, y, sample_weight, random_state):
        """Implement a single boost using the SAMME.R real algorithm."""
        estimator, sampler = self._make_sampler_estimator(
            random_state=random_state)

        X_res, y_res = sampler.fit_resample(X, y)
        sample_weight_res = safe_indexing(sample_weight,
                                          sampler.sample_indices_)
        estimator.fit(X_res, y_res, sample_weight=sample_weight_res)

        y_predict_proba = estimator.predict_proba(X)

        if iboost == 0:
            self.classes_ = getattr(estimator, 'classes_', None)
            self.n_classes_ = len(self.classes_)

        y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1),
                                       axis=0)

        # Instances incorrectly classified
        incorrect = y_predict != y

        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=sample_weight, axis=0))

        # Stop if classification is perfect
        if estimator_error <= 0:
            return sample_weight, 1., 0.

        # Construct y coding as described in Zhu et al.:
        #
        #     y_k = 1 if c == k else -1 / (K - 1)
        #
        # where K == n_classes_ and c, k in [0, K) are indices along the
        # second axis of the y coding with c being the index corresponding
        # to the true class label.
        n_classes = self.n_classes_
        classes = self.classes_
        y_codes = np.array([-1. / (n_classes - 1), 1.])
        y_coding = y_codes.take(classes == y[:, np.newaxis])

        # Displace zero probabilities so the log is defined.
        # Also fix negative elements which may occur with
        # negative sample weights.
        proba = y_predict_proba  # alias for readability
        np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)

        # Boost weight using multi-class AdaBoost SAMME.R alg
        estimator_weight = (-1. * self.learning_rate *
                            ((n_classes - 1.) / n_classes) *
                            (y_coding * np.log(y_predict_proba)).sum(axis=1))

        # Only boost the weights if it will fit again
        if not iboost == self.n_estimators - 1:
            # Only boost positive weights
            sample_weight *= np.exp(estimator_weight *
                                    ((sample_weight > 0) |
                                     (estimator_weight < 0)))

        return sample_weight, 1., estimator_error

    def _boost_discrete(self, iboost, X, y, sample_weight, random_state):
        """Implement a single boost using the SAMME discrete algorithm."""
        estimator, sampler = self._make_sampler_estimator(
            random_state=random_state)

        X_res, y_res = sampler.fit_resample(X, y)
        sample_weight_res = safe_indexing(sample_weight,
                                          sampler.sample_indices_)
        estimator.fit(X_res, y_res, sample_weight=sample_weight_res)

        y_predict = estimator.predict(X)

        if iboost == 0:
            self.classes_ = getattr(estimator, 'classes_', None)
            self.n_classes_ = len(self.classes_)

        # Instances incorrectly classified
        incorrect = y_predict != y

        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=sample_weight, axis=0))

        # Stop if classification is perfect
        if estimator_error <= 0:
            return sample_weight, 1., 0.

        n_classes = self.n_classes_

        # Stop if the error is at least as bad as random guessing
        if estimator_error >= 1. - (1. / n_classes):
            self.estimators_.pop(-1)
            self.samplers_.pop(-1)
            self.pipelines_.pop(-1)
            if len(self.estimators_) == 0:
                raise ValueError('BaseClassifier in AdaBoostClassifier '
                                 'ensemble is worse than random, ensemble '
                                 'can not be fit.')
            return None, None, None

        # Boost weight using multi-class AdaBoost SAMME alg
        estimator_weight = self.learning_rate * (
            np.log((1. - estimator_error) / estimator_error) +
            np.log(n_classes - 1.))

        # Only boost the weights if it will fit again
        if not iboost == self.n_estimators - 1:
            # Only boost positive weights
            sample_weight *= np.exp(estimator_weight * incorrect *
                                    ((sample_weight > 0) |
                                     (estimator_weight < 0)))

        return sample_weight, estimator_weight, estimator_error
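
# A worked sketch of the SAMME estimator weight computed in
# ``_boost_discrete`` (pure numpy, illustrative values): with three classes
# and a weighted error of 0.4, the estimator still receives positive weight,
# because the multi-class correction ``log(n_classes - 1)`` raises the
# random-guessing error threshold to 1 - 1/3 = 2/3.
#
#     >>> import numpy as np
#     >>> learning_rate, estimator_error, n_classes = 1.0, 0.4, 3
#     >>> learning_rate * (np.log((1. - estimator_error) / estimator_error)
#     ...                  + np.log(n_classes - 1.))  # doctest: +ELLIPSIS
#     1.0986...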