Source code for kenchi.outlier_detection.ensemble

from sklearn.ensemble import IsolationForest
from sklearn.utils.validation import check_is_fitted

from .base import BaseOutlierDetector

# Public names exported by a star-import of this module.
__all__ = ['IForest']


class IForest(BaseOutlierDetector):
    """Isolation forest (iForest).

    Wraps :class:`sklearn.ensemble.IsolationForest`; the fitted scikit-learn
    estimator is stored on ``estimator_``.

    Parameters
    ----------
    bootstrap : bool, default False
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    contamination : float or str, default 'auto'
        Proportion of outliers in the data set. Used to define the threshold.

    max_features : int or float, default 1.0
        Number of features to draw from X to train each base estimator.

    max_samples : int, float or str, default 'auto'
        Number of samples to draw from X to train each base estimator.

    n_estimators : int, default 100
        Number of base estimators in the ensemble.

    n_jobs : int, default 1
        Number of jobs to run in parallel. If -1, then the number of jobs is
        set to the number of CPU cores.

    random_state : int or RandomState instance, default None
        Seed of the pseudo random number generator.

    Attributes
    ----------
    anomaly_score_ : array-like of shape (n_samples,)
        Anomaly score for each training data.

    contamination_ : float
        Actual proportion of outliers in the data set.

    threshold_ : float
        Threshold.

    References
    ----------
    .. [#liu08] Liu, F. T., Ting, K. M., and Zhou, Z.-H.,
        "Isolation forest,"
        In Proceedings of ICDM, pp. 413-422, 2008.

    Examples
    --------
    >>> import numpy as np
    >>> from kenchi.outlier_detection import IForest
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = IForest(random_state=0)
    >>> det.fit_predict(X)
    array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1])
    """

    @property
    def estimators_(self):
        """list: Collection of fitted sub-estimators."""
        return self.estimator_.estimators_

    @property
    def estimators_samples_(self):
        """list: Subset of drawn samples for each base estimator."""
        return self.estimator_.estimators_samples_

    @property
    def max_samples_(self):
        """int: Actual number of samples."""
        return self.estimator_.max_samples_

    def __init__(
        self, bootstrap=False, contamination='auto', max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None
    ):
        # Hyper-parameters are stored untouched; they are only handed to the
        # wrapped scikit-learn estimator when ``_fit`` runs.
        self.bootstrap = bootstrap
        self.contamination = contamination
        self.max_features = max_features
        self.max_samples = max_samples
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs
        self.random_state = random_state

    def _check_is_fitted(self):
        """Run the base-class check, then require the forwarded attributes."""
        super()._check_is_fitted()

        # These names are properties that read through ``self.estimator_``,
        # so they only resolve once ``_fit`` has created that estimator.
        check_is_fitted(
            self, ['estimators_', 'estimators_samples_', 'max_samples_']
        )

    def _get_threshold(self):
        """Return the negated ``offset_`` of the wrapped estimator."""
        return -self.estimator_.offset_

    def _fit(self, X):
        """Fit the wrapped ``IsolationForest`` on X and return ``self``."""
        params = {
            'bootstrap': self.bootstrap,
            'contamination': self.contamination,
            'max_features': self.max_features,
            'max_samples': self.max_samples,
            'n_estimators': self.n_estimators,
            'n_jobs': self.n_jobs,
            'random_state': self.random_state,
        }

        # ``behaviour`` was deprecated in scikit-learn 0.22 and removed in
        # 0.24, where passing it raises TypeError. Request the 'new'
        # behaviour only from releases whose constructor still declares it.
        if 'behaviour' in IsolationForest().get_params(deep=False):
            params['behaviour'] = 'new'

        self.estimator_ = IsolationForest(**params).fit(X)

        return self

    def _anomaly_score(self, X):
        """Return the negated ``score_samples`` output of the estimator."""
        return -self.estimator_.score_samples(X)