Source code for kenchi.outlier_detection.reconstruction_based

import numpy as np
from sklearn.decomposition import PCA as _PCA
from sklearn.utils.validation import check_is_fitted

from .base import BaseOutlierDetector

__all__ = ['PCA']


class PCA(BaseOutlierDetector):
    """Outlier detector using Principal Component Analysis (PCA).

    Parameters
    ----------
    contamination : float, default 0.1
        Proportion of outliers in the data set. Used to define the
        threshold.

    iterated_power : int or 'auto', default 'auto'
        Number of iterations for the power method computed by
        svd_solver == 'randomized'.

    n_components : int, float, or 'mle', default None
        Number of components to keep.

    random_state : int or RandomState instance, default None
        Seed of the pseudo random number generator.

    svd_solver : string, default 'auto'
        SVD solver to use. Valid solvers are
        ['auto'|'full'|'arpack'|'randomized'].

    tol : float, default 0.0
        Tolerance to declare convergence for singular values computed by
        svd_solver == 'arpack'.

    whiten : bool, default False
        If True, the ``components_`` vectors are multiplied by the square
        root of n_samples and then divided by the singular values to
        ensure uncorrelated outputs with unit component-wise variances.

    Attributes
    ----------
    anomaly_score_ : array-like of shape (n_samples,)
        Anomaly score for each training sample.

    contamination_ : float
        Actual proportion of outliers in the data set.

    threshold_ : float
        Threshold.

    Examples
    --------
    >>> import numpy as np
    >>> from kenchi.outlier_detection import PCA
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = PCA()
    >>> det.fit_predict(X)
    array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1])
    """

    @property
    def components_(self):
        """array-like of shape (n_components, n_features): Principal axes
        in feature space, representing the directions of maximum variance
        in the data.
        """
        return self.estimator_.components_

    @property
    def explained_variance_(self):
        """array-like of shape (n_components,): Amount of variance
        explained by each of the selected components.
        """
        return self.estimator_.explained_variance_

    @property
    def explained_variance_ratio_(self):
        """array-like of shape (n_components,): Percentage of variance
        explained by each of the selected components.
        """
        return self.estimator_.explained_variance_ratio_

    @property
    def mean_(self):
        """array-like of shape (n_features,): Per-feature empirical mean,
        estimated from the training set.
        """
        return self.estimator_.mean_

    @property
    def noise_variance_(self):
        """float: Estimated noise covariance following the Probabilistic
        PCA model from Tipping and Bishop 1999.
        """
        return self.estimator_.noise_variance_

    @property
    def n_components_(self):
        """int: Estimated number of components.
        """
        return self.estimator_.n_components_

    @property
    def singular_values_(self):
        """array-like of shape (n_components,): Singular values
        corresponding to each of the selected components.
        """
        return self.estimator_.singular_values_

    def __init__(
        self, contamination=0.1, iterated_power='auto', n_components=None,
        random_state=None, svd_solver='auto', tol=0., whiten=False
    ):
        self.contamination  = contamination
        self.iterated_power = iterated_power
        self.n_components   = n_components
        self.random_state   = random_state
        self.svd_solver     = svd_solver
        self.tol            = tol
        self.whiten         = whiten

    def _check_is_fitted(self):
        super()._check_is_fitted()

        check_is_fitted(
            self, [
                'components_', 'explained_variance_',
                'explained_variance_ratio_', 'mean_', 'noise_variance_',
                'n_components_', 'singular_values_'
            ]
        )

    def _fit(self, X):
        self.estimator_ = _PCA(
            iterated_power = self.iterated_power,
            n_components   = self.n_components,
            random_state   = self.random_state,
            svd_solver     = self.svd_solver,
            tol            = self.tol,
            whiten         = self.whiten
        ).fit(X)

        return self

    def _anomaly_score(self, X):
        return np.sum((X - self._reconstruct(X)) ** 2, axis=1)

    def _reconstruct(self, X):
        """Apply dimensionality reduction to the given data, and transform
        the data back to its original space.
        """
        return self.estimator_.inverse_transform(self.estimator_.transform(X))