import numpy as np
from sklearn.decomposition import PCA as _PCA
from sklearn.utils.validation import check_is_fitted

from .base import BaseOutlierDetector

__all__ = ['PCA']


class PCA(BaseOutlierDetector):
"""Outlier detector using Principal Component Analysis (PCA).
Parameters
----------
contamination : float, default 0.1
Proportion of outliers in the data set. Used to define the threshold.
iterated_power : int, default 'auto'
Number of iterations for the power method computed by svd_solver ==
'randomized'.
n_components : int, float, or string, default None
Number of components to keep.
random_state : int or RandomState instance, default None
Seed of the pseudo random number generator.
svd_solver : string, default 'auto'
SVD solver to use. Valid solvers are
['auto'|'full'|'arpack'|'randomized'].
tol : float, default 0.0
Tolerance to declare convergence for singular values computed by
svd_solver == 'arpack'.
whiten : bool, default False
When True the `components_` vectors are multiplied by the square root
of n_samples and then divided by the singular values to ensure
uncorrelated outputs with unit component-wise variances.
Attributes
----------
anomaly_score_ : array-like of shape (n_samples,)
Anomaly score for each training data.
threshold_ : float
Threshold.
components_ : array-like of shape (n_components, n_features)
Principal axes in feature space, representing the directions of maximum
variance in the data.
explained_variance_ : array-like of shape (n_components,)
Amount of variance explained by each of the selected components.
explained_variance_ratio_ : array-like of shape (n_components,)
Percentage of variance explained by each of the selected components.
mean_ : array-like of shape (n_features,)
Per-feature empirical mean, estimated from the training set.
noise_variance_ : float
Estimated noise covariance following the Probabilistic PCA model from
Tipping and Bishop 1999.
n_components_ : int
Estimated number of components.
singular_values_ : array-like of shape (n_components,)
Singular values corresponding to each of the selected components.
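
    Examples
    --------
    A minimal usage sketch; the import path below is an assumption based on
    the package layout and may need adjusting for your install:

    >>> import numpy as np
    >>> from kenchi.outlier_detection import PCA  # assumed import path
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = PCA(contamination=0.1).fit(X)
    >>> det.anomaly_score_.shape
    (10,)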
"""
    @property
    def components_(self):
        return self._estimator.components_

    @property
    def explained_variance_(self):
        return self._estimator.explained_variance_

    @property
    def explained_variance_ratio_(self):
        return self._estimator.explained_variance_ratio_

    @property
    def mean_(self):
        return self._estimator.mean_

    @property
    def noise_variance_(self):
        return self._estimator.noise_variance_

    @property
    def n_components_(self):
        return self._estimator.n_components_

    @property
    def singular_values_(self):
        return self._estimator.singular_values_

    def __init__(
self, contamination=0.1, iterated_power='auto', n_components=None,
random_state=None, svd_solver='auto', tol=0., whiten=False
):
super().__init__(contamination=contamination)
self.iterated_power = iterated_power
self.n_components = n_components
self.random_state = random_state
self.svd_solver = svd_solver
self.tol = tol
self.whiten = whiten

    def _fit(self, X):
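        # Delegate the decomposition to scikit-learn's PCA and keep the
        # fitted estimator so its attributes can be exposed via the
        # properties above.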
        self._estimator = _PCA(
            iterated_power=self.iterated_power,
            n_components=self.n_components,
            random_state=self.random_state,
            svd_solver=self.svd_solver,
            tol=self.tol,
            whiten=self.whiten
        ).fit(X)
return self

    def _anomaly_score(self, X):
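        # Squared Euclidean distance between each sample and its projection
        # onto the fitted principal subspace, i.e. the reconstruction error.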
return np.sum((X - self._reconstruct(X)) ** 2, axis=1)

    def _reconstruct(self, X):
"""Apply dimensionality reduction to the given data, and transform the
data back to its original space.
"""
return self._estimator.inverse_transform(self._estimator.transform(X))

    def score(self, X, y=None):
"""Compute the mean log-likelihood of the given data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data.
y : ignored
Returns
-------
score : float
Mean log-likelihood of the given data.
"""
check_is_fitted(self, '_estimator')
X = self._check_array(X, n_features=self._n_features, estimator=self)
return self._estimator.score(X)
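

if __name__ == '__main__':
    # Illustrative sketch, not part of the library API: the anomaly score
    # computed by _anomaly_score above is the squared reconstruction error,
    # reproduced here with a plain scikit-learn PCA on random data.
    from sklearn.decomposition import PCA as SkPCA

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)

    est = SkPCA(n_components=2).fit(X)
    X_rec = est.inverse_transform(est.transform(X))
    scores = np.sum((X - X_rec) ** 2, axis=1)

    print(scores.shape)  # (100,) -- one reconstruction error per sample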