import numpy as np
from sklearn.decomposition import PCA as _PCA
from sklearn.utils.validation import check_is_fitted

from .base import BaseOutlierDetector

__all__ = ['PCA']


class PCA(BaseOutlierDetector):
"""Outlier detector using Principal Component Analysis (PCA).
Parameters
----------
contamination : float, default 0.1
Proportion of outliers in the data set. Used to define the threshold.
iterated_power : int, default 'auto'
Number of iterations for the power method computed by svd_solver ==
'randomized'.
n_components : int, float, or string, default None
Number of components to keep.
random_state : int or RandomState instance, default None
Seed of the pseudo random number generator.
svd_solver : string, default 'auto'
SVD solver to use. Valid solvers are
['auto'|'full'|'arpack'|'randomized'].
tol : float, default 0.0
Tolerance to declare convergence for singular values computed by
svd_solver == 'arpack'.
whiten : bool, default False
When True the `components_` vectors are multiplied by the square root
of n_samples and then divided by the singular values to ensure
uncorrelated outputs with unit component-wise variances.
Attributes
----------
anomaly_score_ : array-like of shape (n_samples,)
Anomaly score for each training data.
threshold_ : float
Threshold.
components_ : array-like of shape (n_components, n_features)
Principal axes in feature space, representing the directions of maximum
variance in the data.
explained_variance_ : array-like of shape (n_components,)
Amount of variance explained by each of the selected components.
explained_variance_ratio_ : array-like of shape (n_components,)
Percentage of variance explained by each of the selected components.
mean_ : array-like of shape (n_features,)
Per-feature empirical mean, estimated from the training set.
noise_variance_ : float
Estimated noise covariance following the Probabilistic PCA model from
Tipping and Bishop 1999.
n_components_ : int
Estimated number of components.
singular_values_ : array-like of shape (n_components,)
Singular values corresponding to each of the selected components.
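
    Examples
    --------
    A minimal usage sketch; the import path below is an assumption based on
    the package layout and may need adjusting for your install:

    >>> import numpy as np
    >>> from kenchi.outlier_detection import PCA  # assumed import path
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = PCA(contamination=0.1).fit(X)
    >>> det.anomaly_score_.shape
    (10,)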
"""
    @property
    def components_(self):
        return self._estimator.components_

    @property
    def explained_variance_(self):
        return self._estimator.explained_variance_

    @property
    def explained_variance_ratio_(self):
        return self._estimator.explained_variance_ratio_

    @property
    def mean_(self):
        return self._estimator.mean_

    @property
    def noise_variance_(self):
        return self._estimator.noise_variance_

    @property
    def n_components_(self):
        return self._estimator.n_components_

    @property
    def singular_values_(self):
        return self._estimator.singular_values_

    def __init__(
self, contamination=0.1, iterated_power='auto', n_components=None,
random_state=None, svd_solver='auto', tol=0., whiten=False
):
super().__init__(contamination=contamination)
self.iterated_power = iterated_power
self.n_components = n_components
self.random_state = random_state
self.svd_solver = svd_solver
self.tol = tol
self.whiten = whiten

    def _fit(self, X):
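        # Delegate the decomposition to scikit-learn's PCA and keep the
        # fitted estimator so its attributes can be exposed via the
        # properties above.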
        self._estimator = _PCA(
            iterated_power=self.iterated_power,
            n_components=self.n_components,
            random_state=self.random_state,
            svd_solver=self.svd_solver,
            tol=self.tol,
            whiten=self.whiten
        ).fit(X)
return self

    def _anomaly_score(self, X):
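        # Squared Euclidean distance between each sample and its projection
        # onto the fitted principal subspace, i.e. the reconstruction error.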
return np.sum((X - self._reconstruct(X)) ** 2, axis=1)

    def _reconstruct(self, X):
"""Apply dimensionality reduction to the given data, and transform the
data back to its original space.
"""
return self._estimator.inverse_transform(self._estimator.transform(X))

    def score(self, X, y=None):
"""Compute the mean log-likelihood of the given data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data.
y : ignored
Returns
-------
score : float
Mean log-likelihood of the given data.
"""
check_is_fitted(self, '_estimator')
X = self._check_array(X, n_features=self._n_features, estimator=self)
return self._estimator.score(X)
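

if __name__ == '__main__':
    # Illustrative sketch, not part of the library API: the anomaly score
    # computed by _anomaly_score above is the squared reconstruction error,
    # reproduced here with a plain scikit-learn PCA on random data.
    from sklearn.decomposition import PCA as SkPCA

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)

    est = SkPCA(n_components=2).fit(X)
    X_rec = est.inverse_transform(est.transform(X))
    scores = np.sum((X - X_rec) ** 2, axis=1)

    print(scores.shape)  # (100,) -- one reconstruction error per sample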